def main():
    test_ids = np.array([x[:-4] for x in os.listdir(args.test_folder) if x[-4:] == '.png'])

    MODEL_PATH = os.path.join(args.models_dir, args.network + args.alias)
    folds = [int(f) for f in args.fold.split(',')]

    print('Predicting Model:', args.network + args.alias)

    for fold in folds:
        K.clear_session()
        print('***************************** FOLD {} *****************************'.format(fold))

        # Initialize Model
        weights_path = os.path.join(MODEL_PATH, args.prediction_weights.format(fold))

        model, preprocess = get_model(args.network,
                                      input_shape=(args.input_size, args.input_size, 3),
                                      freeze_encoder=args.freeze_encoder)
        model.compile(optimizer=RMSprop(lr=args.learning_rate), loss=make_loss(args.loss_function),
                      metrics=[Kaggle_IoU_Precision])

        model.load_weights(weights_path)

        # Save test predictions to disk
        dir_path = os.path.join(MODEL_PATH, args.prediction_folder.format(fold))
        os.system("mkdir {}".format(dir_path))
        predict_test(model=model,
                     preds_path=dir_path,
                     ids=test_ids,
                     batch_size=args.batch_size * 2,
                     TTA='flip',
                     preprocess=preprocess)

        gc.collect()
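predict_test is defined elsewhere in the repository. A minimal sketch of what the TTA='flip' argument implies, assuming the helper averages predictions over a horizontal flip (function name and axis layout are illustrative, channels-last):

import numpy as np

def predict_flip_tta(model, images, preprocess):
    # Average predictions over the original batch and its horizontally flipped copy.
    x = preprocess(images.astype(np.float32))
    pred = model.predict(x)
    pred_flip = model.predict(x[:, :, ::-1, :])      # flip along the width axis
    return (pred + pred_flip[:, :, ::-1, :]) / 2.0   # un-flip masks before averaging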
Example #2
 def _init_params(self):
     self.net = get_net(self.config['model'], self.config['load_weights'])
     self.net.cuda()
     self.model = get_model(self.config['model'])
     self.criterion = get_loss(self.config['model'])
     self.optimizer = self._get_optim()
     self.scheduler = optim.lr_scheduler.MultiStepLR(
         self.optimizer, milestones=[40, 55, 70, 95], gamma=0.5)
Example #3
    def load_model(self):
        model_path = self.config.paths.model
        dataset = self.config.dataset
        logging.info('dataset: {}'.format(dataset))

        # Set up global model
        model = models.get_model(dataset)
        logging.debug(model)
        return model
Example #4
 def _init_params(self):
     self.criterionG, criterionD = get_loss(self.config['model'])
     self.netG, netD = get_nets(self.config['model'])
     self.netG.to(self.device)
     self.adv_trainer = self._get_adversarial_trainer(self.config['model']['d_name'], netD, criterionD)
     self.model = get_model(self.config['model'])
     self.optimizer_G = self._get_optim(filter(lambda p: p.requires_grad, self.netG.parameters()))
     self.optimizer_D = self._get_optim(self.adv_trainer.get_params())
     self.scheduler_G = self._get_scheduler(self.optimizer_G)
     self.scheduler_D = self._get_scheduler(self.optimizer_D)
Example #5
 def _init_params(self):
     self.criterionG, criterionD = get_loss(self.config['model'])
     self.netG, netD = get_nets(self.config['model'])
     self.netG.cuda()
     self.adv_trainer = self._get_adversarial_trainer(
         self.config['model']['d_name'], netD, criterionD)
     self.model = get_model(self.config['model'])
     self.optimizer_G = self._get_optim(
         filter(lambda p: p.requires_grad, self.netG.parameters()))
     self.optimizer_D = self._get_optim(self.adv_trainer.get_params())
     self.scheduler_G = self._get_scheduler(self.optimizer_G)
     self.scheduler_D = self._get_scheduler(self.optimizer_D)
     # load state dict
     self.netG.load_state_dict(
         torch.load("best_fpn.h5", map_location='cpu')['model'])
Example #6
    def _init_params(self):
        self.criterionG, criterionD = get_loss(self.config['model'])
        netG, netD = get_nets(self.config['model'])
        model = netG.cuda()

        # Load the saved checkpoint and resume training
        checkpoint = torch.load('best_{}.h5'.format(
            self.config['experiment_desc']))
        model.load_state_dict(checkpoint['model'])
        self.netG = model

        self.adv_trainer = self._get_adversarial_trainer(
            self.config['model']['d_name'], netD, criterionD)
        self.model = get_model(self.config['model'])
        self.optimizer_G = self._get_optim(
            filter(lambda p: p.requires_grad, self.netG.parameters()))
        self.optimizer_D = self._get_optim(self.adv_trainer.get_params())
        self.scheduler_G = self._get_scheduler(self.optimizer_G)
        self.scheduler_D = self._get_scheduler(self.optimizer_D)
Example #7
def main():
    parser = argparse.ArgumentParser(
        description='Binary MRI Quality Classification')
    parser.add_argument('--yaml_path',
                        type=str,
                        metavar='YAML',
                        default="config/acdc_binary_classification.yaml",
                        help='Enter the path for the YAML config')
    args = parser.parse_args()

    yaml.add_constructor("!join", yaml_var_concat)

    yaml_path = args.yaml_path
    with open(yaml_path, 'r') as f:
        train_args = yaml.load(f, Loader=yaml.Loader)

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    composed = transforms.Compose([
        Resize((224, 224)),
        OneToThreeDimension(),
        ToTensor(),
        Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ])

    acdc_dataset = ACDCDataset(train_args["pos_samps_test"],
                               train_args["neg_samps_test"],
                               transform=composed)

    dataloader = DataLoader(acdc_dataset,
                            batch_size=train_args["batch_size"],
                            shuffle=False,
                            num_workers=4)
    dataset_size = len(acdc_dataset)

    model_ft = get_model(train_args["model"],
                         device,
                         pretrained=train_args["pretrained"])
    state = get_most_recent_model(train_args["model"],
                                  train_args["model_save_dir"])
    model_ft.load_state_dict(state)

    test(model_ft, dataloader, dataset_size, device=device)
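get_most_recent_model is not shown in this example; a plausible sketch that returns the state dict of the newest checkpoint for a given model (the file-naming pattern is an assumption):

import glob
import os
import torch

def get_most_recent_model(model_name, model_save_dir):
    # Pick the most recently modified checkpoint for this model (assumed '<name>*.pt' layout).
    pattern = os.path.join(model_save_dir, model_name + '*.pt')
    checkpoints = glob.glob(pattern)
    if not checkpoints:
        raise FileNotFoundError('no checkpoints matching ' + pattern)
    latest = max(checkpoints, key=os.path.getmtime)
    return torch.load(latest, map_location='cpu')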
Example #8
def keras_fit_generator():

    kfolds = [0]
    # kfolds = [2, 3, 4]
    for fold in kfolds:

        K.clear_session()
        print('fold = {}'.format(fold))
        print('begin load data')
        X_train, y_train, X_val, y_val = load_data(fold)
        print(X_train.shape, y_train.shape, X_val.shape, y_val.shape)
        # (1250, 256, 256, 1) (1250, 256, 256, 1) (127, 256, 256, 1) (127, 256, 256, 1)
        print('load data over')


        model, process = get_model(network=network, input_shape=(X_train.shape[1], X_train.shape[2], X_train.shape[3]), freeze_encoder=False)

        model.load_weights(pretrain_weight + str(fold) + '.hdf5')

        val_gen = Generator(X_val, y_val, batch_size=len(y_val), shuffle=True, aug=True, process=process)
        X_val_steps, y_val_steps = next(val_gen.generator)

        train_gen = Generator(X_train, y_train, batch_size=batch_size, shuffle=True, aug=True, process=process)

        model.compile(optimizer=Adam(lr=learning_rate), loss=make_loss(loss_name=loss_function), metrics=[dice_coef])

        model.summary()

        c_backs = get_callback(callback, fold, num_sample=len(X_train))

        model.fit_generator(
                            train_gen.generator,
                            steps_per_epoch=(len(X_train)//batch_size)*2,
                            epochs=epochs,
                            verbose=1,
                            shuffle=True,
                            validation_data=(X_val_steps, y_val_steps),
                            callbacks=c_backs,
                            use_multiprocessing=False)
        gc.collect()
Example #9
def main(args):
    time_stamp = '{:.0f}'.format(time.time() % 100000)
    if torch.cuda.is_available():
        logging.info('Utilizing GPU', extra=args.client)
        print('Utilizing GPU')

    train_loader, val_loader = load_data(args)
    model = get_model(args.model, args)

    if args.batch_size > 256:  # and args.dataset == 'imagenet' and args.model == 'resnet':
        batch_accumulate_num = args.batch_size // 256
    else:
        batch_accumulate_num = 1
    # create model
    if args.dataset == 'imagenet':
        if batch_accumulate_num > 1:
            args.iterations_per_epoch = len(train_loader.dataset.imgs) // 256
        else:
            args.iterations_per_epoch = len(train_loader.dataset.imgs) // args.batch_size
        val_len = len(val_loader.dataset.imgs) // 1024
    else:
        if batch_accumulate_num > 1:
            args.iterations_per_epoch = len(train_loader.dataset.train_labels) // 256
        else:
            args.iterations_per_epoch = len(train_loader.dataset.train_labels) // args.batch_size

        val_len = len(val_loader.dataset.test_labels) // 1024

    # get the number of model parameters
    log_str = 'Number of model parameters: {}'.format(sum([p.data.nelement() for p in model.parameters()]))
    logging.info(log_str, extra=args.client)
    print(log_str)
    # for training on multiple GPUs.
    model = torch.nn.DataParallel(model)
    model = model.cuda()
    server = ParameterServer.get_server(args.optimizer, model, args)
    val_statistics = Statistics.get_statistics('image_classification', args)
    train_statistics = Statistics.get_statistics('image_classification', args)
    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume + '/checkpoint.pth.tar'):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume + '/checkpoint.pth.tar')
            args.start_epoch = checkpoint['epoch']
            server = checkpoint['server']
            val_statistics = checkpoint['val_stats']
            train_statistics = checkpoint['train_stats']
            model.load_state_dict(checkpoint['state_dict'])
            print('=> loaded checkpoint {} (epoch {})'.format(args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    cudnn.benchmark = True
    # define loss function (criterion) and optimizer
    loss_params = {}
    if args.label_smoothing > 0:
        loss_params['smooth_eps'] = args.label_smoothing
    criterion = getattr(model, 'criterion', CrossEntropyLoss)(**loss_params).cuda()
    # criterion = nn.CrossEntropyLoss().cuda()
    if args.bar is True:
        train_bar = IncrementalBar('Training  ', max=args.iterations_per_epoch, suffix='%(percent)d%%')
        val_bar = IncrementalBar('Evaluating', max=val_len, suffix='%(percent)d%%')
    else:
        train_bar = None
        val_bar = None
    log_str = '{}: Training neural network for {} epochs with {} workers'.format(args.id, args.epochs,
                                                                                 args.workers_num)
    logging.info(log_str, extra=args.client)
    print(log_str)
    train_time = time.time()
    for epoch in range(args.start_epoch, args.epochs):
        # train for one epoch
        train_loss, train_error = train(train_loader, model, criterion, server, epoch, args.workers_num, args.grad_clip,
                                        batch_accumulate_num, train_bar, train_statistics, args.client)

        train_time = time.time() - train_time
        if args.bar is True:
            train_bar.finish()
            train_bar.index = 0

        # evaluate on validation set
        val_time = time.time()
        with torch.no_grad():
            val_loss, val_error = validate(val_loader, model, criterion, server, val_statistics, val_bar)
        train_statistics.save_loss(train_loss)
        train_statistics.save_error(train_error)
        train_statistics.save_weight_mean_dist(server.get_workers_mean_statistics())
        train_statistics.save_weight_master_dist(server.get_workers_master_statistics())
        train_statistics.save_mean_master_dist(server.get_mean_master_dist())
        train_statistics.save_weight_norm(server.get_server_weights())
        train_statistics.save_gradient_norm(server.get_server_gradients())
        val_time = time.time() - val_time
        if args.bar is True:
            val_bar.finish()
            val_bar.index = 0

        log_str = 'Epoch [{0:1d}]: Train: Time [{1:.2f}], Loss [{2:.3f}], Error[{3:.3f}] | ' \
                  'Test: Time [{4:.2f}], Loss [{5:.3f}], Error[{6:.3f}]'.format(epoch + 1, train_time, train_loss,
                                                                                train_error, val_time, val_loss,
                                                                                val_error)
        logging.info(log_str, extra=args.client)
        print(log_str)
        if epoch % args.save == 0 and epoch > 0:
            save_checkpoint({'epoch': epoch + 1,
                             'state_dict': model.state_dict(),
                             'val_stats': val_statistics,
                             'train_stats': train_statistics,
                             'server': server}, sim_name=(args.name + time_stamp + '_' + str(epoch)))
        train_time = time.time()

    return train_statistics, val_statistics
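The batch_accumulate_num logic above implies that train() processes large batches as 256-sample chunks and accumulates gradients before each optimizer step; a minimal PyTorch sketch of that pattern (not the repository's actual train() implementation):

def accumulated_step(model, criterion, optimizer, chunks):
    # One optimizer step over several sub-batches; gradients sum in .grad between steps.
    optimizer.zero_grad()
    for inputs, targets in chunks:  # e.g. batch_size // 256 sub-batches
        loss = criterion(model(inputs), targets) / len(chunks)  # scale so the step matches one large batch
        loss.backward()
    optimizer.step()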
Example #10
def train_model(args):
    train_samples, val_samples = get_num_samples(args.test_file, args.train_file)
    img_path = ''

    m.doFlip = args.doFlip
    m.doScale = args.doScale

    generator_test_batch, generator_train_batch, generator_val_batch, model = m.get_model(args)

    if os.path.isfile(args.save_path + '/weights.h5') and args.load_opt_train == 0:
        # model.summary()
        model.load_weights(args.save_path + '/weights.h5')
        return model, generator_train_batch, generator_val_batch, generator_test_batch

    lr = 0.0005 # org
    lr = 0.05 # OK for C3D
    # lr = 0.005
    # opt = SGD(lr=lr, momentum=0.9, nesterov=True)
    opt = Adam(lr=lr, decay=0.9)
    model.compile(loss='categorical_crossentropy', optimizer=opt, metrics=['accuracy'])
    # model.summary()

    # Restore optimizer state
    if args.load_opt_train == 1:
        model.load_weights(args.save_path + '/weights.h5')
        model._make_train_function()
        with open(args.save_path + '/optimizer.pkl', 'rb') as f:
            weight_values = pickle.load(f)
        model.optimizer.set_weights(weight_values)

    print(args)
    history = model.fit_generator(generator_train_batch(args.train_file, args.batch_size, args.num_classes, img_path),
                                  steps_per_epoch=train_samples // args.batch_size,
                                  epochs=args.epochs,
                                  # callbacks=[onetenth_4_8_12(lr)],
                                  validation_data=generator_val_batch(args.test_file,
                                                                      args.batch_size, args.num_classes, img_path),
                                  validation_steps=val_samples // args.batch_size,
                                  verbose=1)
    if not os.path.exists(args.save_path):
        os.makedirs(args.save_path)
    plot_history(history, args.save_path)
    save_history(history, args.save_path)
    # Save model
    model.save_weights(args.save_path + '/weights.h5')

    # Save optimizer state
    if args.save_opt_train == 1:
        symbolic_weights = getattr(model.optimizer, 'weights')
        weight_values = K.batch_get_value(symbolic_weights)
        with open(args.save_path + '/optimizer.pkl', 'wb') as f:
            pickle.dump(weight_values, f)

    with open(args.save_path + '/settings.txt', 'w') as outfile:
        outfile.write('num_classes:\t' + str(args.num_classes) + '\n')
        outfile.write('batch_size:\t' + str(args.batch_size) + '\n')
        outfile.write('epochs:\t' + str(args.epochs) + '\n')
        outfile.write('img_path:\t' + str(img_path) + '\n')
        outfile.write('train_file:\t' + str(args.train_file) + '\n')
        outfile.write('test_file:\t' + str(args.test_file) + '\n')
        outfile.write('lr:\t' + str(lr) + '\n')

    return model, generator_train_batch, generator_val_batch, generator_test_batch
Example #11
        elif o in ("-t", "--tag"):
            settings['tag'] = a
        else:
            assert False, "unhandled option"

    model_eval = None
    try:
        model_eval = eval("Model." + model.upper())
    except AttributeError:
        print "You have selected a model that doesn't exist. Defaulting to RANDOM"
        model_eval = Model.RANDOM

    nova = client.Client(username, api_key, project_id, auth_url)

    all_servers = nova.servers.list()
    servers = [x for x in all_servers if x.status == "ACTIVE"]
    if 'tag' in settings:
        servers = [x for x in servers if settings['tag'] in x.metadata and x.metadata[settings['tag']] == '1']

    if len(servers) == 0:
        print "No servers found. Exiting now..."
        sys.exit()

    FailureModel = get_model(model_eval)
    m = FailureModel(nova, servers, settings)
    m.anarchy()
    

if __name__ == "__main__":
    main()
Example #12

print('==> Preparing data..')
data = ds[args.dataset]
meta = dsmeta[args.dataset]
classes, nc, size = meta['classes'], meta['nc'], meta['size']

trainset, valset, testset = data(args)

# Toxic comments uses its own data loaders
trainloader = torch.utils.data.DataLoader(trainset, batch_size=args.batch_size, shuffle=True, num_workers=args.num_workers) if (trainset is not None) and (args.dataset not in nlp_data) else trainset
valloader = torch.utils.data.DataLoader(valset, batch_size=args.batch_size, shuffle=True, num_workers=args.num_workers) if (valset is not None) and (args.dataset not in nlp_data) else valset
testloader = torch.utils.data.DataLoader(testset, batch_size=args.batch_size, shuffle=True, num_workers=args.num_workers) if (args.dataset not in nlp_data) else testset

print('==> Building model..')
net = get_model(args, classes, nc)
net = nn.DataParallel(net) if args.parallel else net
optimizer = optim.SGD(net.parameters(), lr=args.lr, momentum=0.9, weight_decay=1e-4)

if (args.dataset in nlp_data) or ('modelnet' in args.dataset):
    optimizer = optim.Adam(net.parameters(), lr=args.lr)


print('==> Setting up callbacks..')
current_time = datetime.now().strftime('%b%d_%H-%M-%S') + "-run-" + str(args.run_id)
tboard = TensorBoard(write_graph=False, comment=current_time, log_dir=args.log_dir)
tboardtext = TensorBoardText(write_epoch_metrics=False, comment=current_time, log_dir=args.log_dir)


@torchbearer.callbacks.on_start
def write_params(_):
    pass  # body omitted in the original example
Example #13
def main():
  
    # Read in the data; each row of the CSV is one sample
    train = pd.read_csv(args.folds_csv)
    # Directory where the model is saved
    MODEL_PATH = os.path.join(args.models_dir, args.network + args.alias)
    # The input data is pre-split into folds; this list holds the fold ids (e.g. [1, 2, 3, 4, 5])
    folds = [int(f) for f in args.fold.split(',')]

    print('Training Model:', args.network + args.alias)

    for fold in folds:

        K.clear_session()
        print('***************************** FOLD {} *****************************'.format(fold))

        if fold == 0:
            if os.path.isdir(MODEL_PATH):
                raise ValueError('Such Model already exists')
            os.system("mkdir {}".format(MODEL_PATH))

        # Train/Validation sampling
        df_train = train[train.fold != fold].copy().reset_index(drop=True)
        df_valid = train[train.fold == fold].copy().reset_index(drop=True)

        # Train on pseudolabels only
        if args.pseudolabels_dir != '':
            pseudolabels = pd.read_csv(args.pseudolabels_csv)
            df_train = pseudolabels.sample(frac=1, random_state=13).reset_index(drop=True)

        # Keep only non-black images
        ids_train, ids_valid = df_train[df_train.unique_pixels > 1].id.values, df_valid[
            df_valid.unique_pixels > 1].id.values

        print('Training on {} samples'.format(ids_train.shape[0]))
        print('Validating on {} samples'.format(ids_valid.shape[0]))

        # Initialize model
        weights_path = os.path.join(MODEL_PATH, 'fold_{fold}.hdf5'.format(fold=fold))

        # Get the model
        model, preprocess = get_model(args.network,
                                      input_shape=(args.input_size, args.input_size, 3),
                                      freeze_encoder=args.freeze_encoder)

        # LB metric threshold
        def lb_metric(y_true, y_pred):
            return Kaggle_IoU_Precision(y_true, y_pred, threshold=0 if args.loss_function == 'lovasz' else 0.5)

        model.compile(optimizer=RMSprop(lr=args.learning_rate), loss=make_loss(args.loss_function),
                      metrics=[lb_metric])

        if args.pretrain_weights is None:
            print('No weights passed, training from scratch')
        else:
            wp = args.pretrain_weights.format(fold)
            print('Loading weights from {}'.format(wp))
            model.load_weights(wp, by_name=True)

        # Get augmentations
        augs = get_augmentations(args.augmentation_name, p=args.augmentation_prob)

        # Data generator
        dg = SegmentationDataGenerator(input_shape=(args.input_size, args.input_size),
                                       batch_size=args.batch_size,
                                       augs=augs,
                                       preprocess=preprocess)

        train_generator = dg.train_batch_generator(ids_train)
        validation_generator = dg.evaluation_batch_generator(ids_valid)

        # Get callbacks
        callbacks = get_callback(args.callback,
                                 weights_path=weights_path,
                                 fold=fold)

        # Fit the model with Generators:
        model.fit_generator(generator=ThreadsafeIter(train_generator),
                            steps_per_epoch=ids_train.shape[0] // args.batch_size * 2,
                            epochs=args.epochs,
                            callbacks=callbacks,
                            validation_data=ThreadsafeIter(validation_generator),
                            validation_steps=np.ceil(ids_valid.shape[0] / args.batch_size),
                            workers=args.num_workers)

        gc.collect()
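ThreadsafeIter is defined elsewhere in the repository; it is needed because a plain Python generator is not safe to share across the workers passed to fit_generator. A common minimal implementation (assumed, not the repository's exact code):

import threading

class ThreadsafeIter:
    # Serialize next() calls so multiple Keras workers can consume one generator.
    def __init__(self, it):
        self.it = it
        self.lock = threading.Lock()

    def __iter__(self):
        return self

    def __next__(self):
        with self.lock:
            return next(self.it)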
Example #14
                                   OneToThreeDimension(),
                                   ToTensor(),
                                   Normalize(mean=[0.485, 0.456, 0.406],
                                             std=[0.229, 0.224, 0.225]),
                                  ])
    acdc_dataset = {x: ACDCDataset(train_args["pos_samps_"+x],
                                  train_args["neg_samps_"+x],
                                  transform=composed)
                   for x in ["train", "val", "test"]}

    dataloader = {x: DataLoader(acdc_dataset[x],
                                batch_size=train_args["batch_size"],
                                shuffle=True, num_workers=4,
                                # sampler=sampler[x]
                                )
                  for x in ["train", "val", "test"]}
    dataset_sizes = {x: len(acdc_dataset[x]) for x in ["train", "val", "test"]}

    model_ft = get_model(train_args["model"], device,
                         pretrained=train_args["pretrained"])

    criterion = get_loss(train_args["loss_name"])

    optimizer_ft = optim.Adam(model_ft.parameters(), lr=1e-5)

    model_ft = train(model_ft, criterion, optimizer_ft,
                     num_epochs=train_args["epoch"])

    test(model_ft, dataloader["test"], dataset_sizes["test"])

Example #15
 def _init_params(self):
     self.netG, netD = get_nets(self.config['model'])
     self.netG.cuda()
     self.model = get_model(self.config['model'])
Example #16
def train(rank, args, shared_model, optimizer, env_conf):
    ptitle('Train {0}: {1}'.format(args.env, rank))
    print('Start training agent: ', rank)

    if rank == 0:
        logger = Logger(args.log_dir + '_losses/')
        train_step = 0

    gpu_id = args.gpu_ids[rank % len(args.gpu_ids)]
    env_conf["env_gpu"] = gpu_id
    torch.manual_seed(args.seed + rank)
    if gpu_id >= 0:
        torch.cuda.manual_seed(args.seed + rank)

    env = database_env(env_conf, seed=0)

    if optimizer is None:
        if args.optimizer == 'RMSprop':
            optimizer = optim.RMSprop(shared_model.parameters(), lr=args.lr)
        if args.optimizer == 'Adam':
            optimizer = optim.Adam(shared_model.parameters(),
                                   lr=args.lr,
                                   amsgrad=args.amsgrad)

    player = Agent(None, env, args, None, gpu_id)
    player.gpu_id = gpu_id
    player.model = get_model(args,
                             args.model,
                             env_conf["observation_shape"],
                             args.features,
                             env_conf["num_actions"],
                             gpu_id=0,
                             lstm_feats=args.lstm_feats)
    player.state = player.env.reset()
    player.state = torch.from_numpy(player.state).float()

    if gpu_id >= 0:
        with torch.cuda.device(gpu_id):
            player.state = player.state.cuda()
            player.model = player.model.cuda()
    player.model.train()

    if rank == 0:
        eps_reward = 0
        pinned_eps_reward = 0

    while True:
        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                player.model.load_state_dict(shared_model.state_dict())
        else:
            player.model.load_state_dict(shared_model.state_dict())

        if player.done:
            player.eps_len = 0

            if rank == 0:
                if train_step % args.train_log_period == 0 and train_step > 0:
                    print("train: step", train_step, "\teps_reward",
                          eps_reward)
                if train_step > 0:
                    pinned_eps_reward = player.env.sum_reward
                    eps_reward = 0

            if args.lstm_feats:
                player.cx, player.hx = init_linear_lstm(
                    args.lstm_feats, gpu_id)

        elif args.lstm_feats:
            player.cx = Variable(player.cx.data)
            player.hx = Variable(player.hx.data)

        for step in range(args.num_steps):
            player.action_train()
            if rank == 0:
                eps_reward = player.env.sum_reward
            if player.done:
                break

        if player.done:
            if rank == 0:
                if train_step % args.train_log_period == 0 and train_step > 0:
                    print("train: step", train_step, "\teps_reward",
                          eps_reward)
                    # print ("rewards: ", player.env.rewards)
                    # print ("actions: ", player.actions)

        if player.done:
            state = player.env.reset()
            player.state = torch.from_numpy(state).float()
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.state = player.state.cuda()

        R = torch.zeros(1, 1, 1, 1)

        if not player.done:
            if args.lstm_feats:
                value, _, _ = player.model(
                    (Variable(player.state.unsqueeze(0)), (player.hx,
                                                           player.cx)))
            else:
                value, _ = player.model(Variable(player.state.unsqueeze(0)))
            R = value.data

        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                R = R.cuda()

        player.values.append(Variable(R))
        policy_loss = 0
        value_loss = 0

        gae = torch.zeros(1, 1, 1, 1)
        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                gae = gae.cuda()
        R = Variable(R)

        for i in reversed(range(len(player.rewards))):
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    reward_i = torch.tensor(player.rewards[i]).cuda()
            else:
                reward_i = torch.tensor(player.rewards[i])

            R = args.gamma * R + reward_i
            advantage = R - player.values[i]
            value_loss = value_loss + (0.5 * advantage * advantage).mean()
            delta_t = player.values[
                i + 1].data * args.gamma + reward_i - player.values[i].data
            gae = gae * args.gamma * args.tau + delta_t
            policy_loss = policy_loss - \
                    (player.log_probs[i] * Variable(gae)).mean () - \
                    (args.entropy_alpha * player.entropies[i]).mean ()

        player.model.zero_grad()
        sum_loss = (policy_loss + value_loss)

        sum_loss.backward()
        ensure_shared_grads(player.model, shared_model, gpu=gpu_id >= 0)
        optimizer.step()
        player.clear_actions()

        if rank == 0:
            train_step += 1
            if train_step % args.log_period == 0 and train_step > 0:
                log_info = {
                    'sum_loss': sum_loss,
                    'value_loss': value_loss,
                    'policy_loss': policy_loss,
                    'advantage': advantage,
                    'train eps reward': pinned_eps_reward,
                }

                for tag, value in log_info.items():
                    logger.scalar_summary(tag, value, train_step)
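The reversed loop above computes Generalized Advantage Estimation; here is a plain-Python reference of the same recurrence (assuming values holds one more entry than rewards, as player.values does after R is appended):

def gae_advantages(rewards, values, gamma, tau):
    # delta_t = r_t + gamma * V(s_{t+1}) - V(s_t);  A_t = delta_t + gamma * tau * A_{t+1}
    advantages = [0.0] * len(rewards)
    gae = 0.0
    for t in reversed(range(len(rewards))):
        delta = rewards[t] + gamma * values[t + 1] - values[t]
        gae = gamma * tau * gae + delta
        advantages[t] = gae
    return advantages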
Example #17
        os.environ["CUDA_VISIBLE_DEVICES"] = "0"

    set_global_seed(args.seed)
    prepare_cudnn(deterministic=True)

    sub_name = f'Model_{args.task}_{args.model_type}_{args.encoder}_bs_{args.bs}_{str(datetime.datetime.now().date())}'
    logdir = f"./logs/{sub_name}" if args.logdir is None else args.logdir

    preprocessing_fn = smp.encoders.get_preprocessing_fn(args.encoder, args.encoder_weights)
    loaders = prepare_loaders(path=args.path, bs=args.bs,
                              num_workers=args.num_workers, preprocessing_fn=preprocessing_fn, preload=args.preload,
                              image_size=(args.height, args.width), augmentation=args.augmentation, task=args.task)
    test_loader = loaders['test']
    del loaders['test']

    model = get_model(model_type=args.segm_type, encoder=args.encoder, encoder_weights=args.encoder_weights,
                      activation=None, task=args.task)

    optimizer = get_optimizer(optimizer=args.optimizer, lookahead=args.lookahead, model=model,
                              separate_decoder=args.separate_decoder, lr=args.lr, lr_e=args.lr_e)

    if args.scheduler == 'ReduceLROnPlateau':
        scheduler = ReduceLROnPlateau(optimizer, factor=0.6, patience=3)
    else:
        scheduler = ReduceLROnPlateau(optimizer, factor=0.3, patience=3)

    if args.loss == 'BCEDiceLoss':
        criterion = smp.utils.losses.BCEDiceLoss(eps=1.)
    elif args.loss == 'BCEJaccardLoss':
        criterion = smp.utils.losses.BCEJaccardLoss(eps=1.)
    elif args.loss == 'FocalLoss':
        criterion = FocalLoss()
Example #18
def test(args, shared_model, env_conf):
    ptitle('Valid agent')

    if args.valid_gpu < 0:
        gpu_id = args.gpu_ids[-1]
    else:
        gpu_id = args.valid_gpu
    env_conf["env_gpu"] = gpu_id

    log = {}
    logger = Logger(args.log_dir)

    create_dir(args.log_dir + "models/")

    os.system("cp *.sh " + args.log_dir)
    os.system("cp *.py " + args.log_dir)
    os.system("cp models/models.py " + args.log_dir + "models/")
    os.system("cp models/basic_modules.py " + args.log_dir + "models/")

    setup_logger('{}_log'.format(args.env),
                 r'{0}{1}_log'.format(args.log_dir, args.env))
    log['{}_log'.format(args.env)] = logging.getLogger('{}_log'.format(
        args.env))
    d_args = vars(args)
    env_conf_log = env_conf

    for k in d_args.keys():
        log['{}_log'.format(args.env)].info('{0}: {1}'.format(k, d_args[k]))
    for k in env_conf_log.keys():
        log['{}_log'.format(args.env)].info('{0}: {1}'.format(
            k, env_conf_log[k]))

    torch.manual_seed(args.seed)

    if gpu_id >= 0:
        torch.cuda.manual_seed(args.seed)

    env = database_env(env_conf, seed=0, dstype="test")
    env.max_step = 900

    reward_sum = 0
    start_time = time.time()
    num_tests = 0
    reward_total_sum = 0

    player = Agent(None, env, args, None, gpu_id)
    player.gpu_id = gpu_id

    player.model = get_model(args,
                             args.model,
                             env_conf["observation_shape"],
                             args.features,
                             env_conf["num_actions"],
                             gpu_id=0,
                             lstm_feats=args.lstm_feats)

    with torch.cuda.device(gpu_id):
        player.model = player.model.cuda()

    player.state = player.env.reset()
    player.state = torch.from_numpy(player.state).float()

    if gpu_id >= 0:
        with torch.cuda.device(gpu_id):
            player.state = player.state.cuda()
    player.model.eval()

    flag = True
    create_dir(args.save_model_dir)

    recent_episode_scores = ScalaTracker(100)
    max_score = 0

    while True:
        if flag:
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.model.load_state_dict(shared_model.state_dict())
            else:
                player.model.load_state_dict(shared_model.state_dict())
            player.model.eval()
            flag = False

        player.action_test()

        reward_sum += player.reward.mean()

        if player.done:
            flag = True
            num_tests += 1

            reward_total_sum += reward_sum
            reward_mean = reward_total_sum / num_tests

            log['{}_log'.format(args.env)].info(
                "VALID: Time {0}, episode reward {1}, num tests {4}, episode length {2}, reward mean {3:.4f}"
                .format(
                    time.strftime("%Hh %Mm %Ss",
                                  time.gmtime(time.time() - start_time)),
                    reward_sum, player.eps_len, reward_mean, num_tests))

            recent_episode_scores.push(reward_sum)

            if args.save_max and recent_episode_scores.mean() >= max_score:
                max_score = recent_episode_scores.mean()
                if gpu_id >= 0:
                    with torch.cuda.device(gpu_id):
                        state_to_save = {}
                        state_to_save = player.model.state_dict()
                        torch.save(
                            state_to_save,
                            '{0}{1}.dat'.format(args.save_model_dir,
                                                'best_model_' + args.env))

            if num_tests % args.save_period == 0:
                if gpu_id >= 0:
                    with torch.cuda.device(gpu_id):
                        state_to_save = player.model.state_dict()
                        torch.save(
                            state_to_save, '{0}{1}.dat'.format(
                                args.save_model_dir,
                                args.env + '_' + str(num_tests)))

            if num_tests % args.log_period == 0:
                print("------------------------------------------------")
                print(args.env)
                print("Log test #:", num_tests)
                print("sum rewards: ", player.env.sum_reward)
                print("action_history\n", player.env.action_his)
                print()
                print("------------------------------------------------")

                log_info = {
                    'mean_reward': reward_mean,
                    '100_mean_reward': recent_episode_scores.mean()
                }
                for tag, value in log_info.items():
                    logger.scalar_summary(tag, value, num_tests)

            reward_sum = 0
            player.eps_len = 0

            player.clear_actions()
            state = player.env.reset()

            time.sleep(15)

            player.state = torch.from_numpy(state).float()
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.state = player.state.cuda()
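ScalaTracker(100) is used above as a window over the last 100 episode scores; a minimal sketch consistent with its push()/mean() calls (the actual implementation lives elsewhere in the repository):

from collections import deque

class ScalaTracker:
    # Keep the most recent maxlen scalars and expose their mean.
    def __init__(self, maxlen):
        self.values = deque(maxlen=maxlen)

    def push(self, value):
        self.values.append(float(value))

    def mean(self):
        return sum(self.values) / len(self.values) if self.values else 0.0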
Example #19

def sig_iou_score(y_true, y_pred):
    return iou_score(y_true, tf.math.sigmoid(y_pred))


def sigm_binary_accuracy(y_true, y_pred):
    return binary_accuracy(y_true, tf.math.sigmoid(y_pred))


loss_function = 'seloss'

# Get the model
model = get_model(network='unet_resnext_50_margo', input_shape=(512, 512, 3),
                  freeze_encoder=False)
Adam_opt = Adam(lr=0.00005, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.0,
                amsgrad=False)  # an earlier variant used decay=0.001, amsgrad=True


loss_history = []
weight_path = "/data/margokat/models_saved/inria/{}_weights.best.hdf5".format('resnext50_unet_margo_se')
checkpoint = ModelCheckpoint(weight_path, monitor='val_loss', verbose=1,
                             save_best_only=True, mode='min', save_weights_only=True)

reduceLROnPlat = ReduceLROnPlateau(monitor='val_loss', factor=0.8, patience=2, verbose=1, mode='auto',
                                   epsilon=0.0001, cooldown=4, min_lr=0.0001)
early = EarlyStopping(monitor="val_loss",
Example #20
if __name__ == '__main__':
    args = parser.parse_args()

    torch.manual_seed(args.seed)
    if args.gpu_ids == -1:
        args.gpu_ids = [-1]
    else:
        torch.cuda.manual_seed(args.seed)
        mp.set_start_method('spawn')

    env_conf = setup_env_conf(args)

    shared_model = get_model(args,
                             args.model,
                             env_conf["observation_shape"],
                             args.features,
                             env_conf["num_actions"],
                             gpu_id=-1,
                             lstm_feats=args.lstm_feats)

    if args.load:
        saved_state = torch.load(args.load,
                                 map_location=lambda storage, loc: storage)
        shared_model.load_state_dict(saved_state)
    shared_model.share_memory()

    if args.shared_optimizer:
        if args.optimizer == 'RMSprop':
            optimizer = SharedRMSprop(shared_model.parameters(), lr=args.lr)
        if args.optimizer == 'Adam':
            optimizer = SharedAdam(shared_model.parameters(),
Example #21
def main(model='resnet18',
         rep_dim=490,
         dataset='curated',
         base_path=None,
         unzip=False,
         ae_train=True,
         clf_train=True,
         ae_epochs=100,
         clf_epochs=100,
         batch_size=4,
         accumulation_steps=32,
         ae_loadfile=None,
         clf_loadfile=None,
         save_model=True,
         ae_test=True,
         accumulate=False):
    '''
    model : CNN architecture to use ['LeNet', 'VGG', ...]
    dataset : 'curated' or 'full'
    base_path : path/to/ChestXRay, e.g. /home/paperspace/ChestXRay
    '''
    if base_path is None:
        raise ValueError('Please point base_path to ChestXRay/')

    if ae_train and (ae_loadfile or clf_loadfile):
        raise ValueError(
            'Please either set ae_train to True or specify a loadfile but not both.'
        )

    filename = setup_logging(base_path=base_path, model=model, rep_dim=rep_dim)
    logger = logging.getLogger()
    logging.info('Architecture : {}'.format(model))
    logging.info('Representation Dimensionality : {}'.format(rep_dim))
    logging.info('Dataset : {}'.format(dataset))

    if unzip:
        unzip_data(base_path)

    trainloader = get_dataloader(dataset=dataset,
                                 set_='train',
                                 batch_size=batch_size)
    testloader = get_dataloader(dataset=dataset,
                                set_='test',
                                batch_size=batch_size)

    #autoencoder = resnet18(num_classes=490, autoencoder=True)
    autoencoder = get_model(model=model, kind='autoencoder', rep_dim=rep_dim)
    if ae_loadfile is not None:
        ae_load_path = os.path.join(base_path,
                                    'models/saved_models/') + ae_loadfile
        autoencoder.load_state_dict(torch.load(ae_load_path), strict=False)
    if ae_train:
        autoencoder = pretrain(trainloader=trainloader,
                               autoencoder=autoencoder,
                               ae_epochs=ae_epochs,
                               accumulation_steps=accumulation_steps,
                               accumulate=accumulate)

        if save_model:
            save_path = os.path.join(
                base_path, 'models/saved_models/') + 'ae: ' + filename + '.pt'
            torch.save(autoencoder.state_dict(), save_path)

    if ae_test:
        pretest(testloader=testloader, autoencoder=autoencoder)
    del autoencoder

    classifier = get_model(model=model, kind='classifier', rep_dim=rep_dim)
    # Warm-start from the autoencoder weights saved above (requires ae_train and save_model)
    classifier.load_state_dict(torch.load(save_path), strict=False)
    if clf_loadfile is not None:
        clf_load_path = os.path.join(base_path,
                                     'models/saved_models/') + clf_loadfile
        classifier.load_state_dict(torch.load(clf_load_path), strict=False)

    c = find_center(trainloader=trainloader,
                    classifier=classifier,
                    rep_dim=rep_dim)

    if clf_train:
        classifier = train(trainloader=trainloader,
                           classifier=classifier,
                           clf_epochs=clf_epochs,
                           accumulation_steps=accumulation_steps,
                           c=c,
                           accumulate=accumulate)

        if save_model:
            save_path = os.path.join(
                base_path, 'models/saved_models/') + 'clf: ' + filename + '.pt'
            torch.save(classifier.state_dict(), save_path)

    test(testloader=testloader, classifier=classifier, c=c)
    return
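find_center is not shown; in Deep SVDD-style training it is typically the mean of the encoder's representations over the training set. A hedged sketch consistent with the call above, assuming the classifier maps a batch to (batch, rep_dim) representations:

import torch

def find_center(trainloader, classifier, rep_dim):
    # Average the learned representations over the whole training set.
    classifier.eval()
    c = torch.zeros(rep_dim)
    n = 0
    with torch.no_grad():
        for inputs, _ in trainloader:
            reps = classifier(inputs)
            c += reps.sum(dim=0)
            n += reps.size(0)
    return c / n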
Example #22
def main(args):
    time_stamp = '{:.0f}'.format(time.time() % 100000)
    if torch.cuda.is_available():
        logging.info('Utilizing GPU', extra=args.client)
        print('Utilizing GPU')

    train_loader, val_loader = load_data(args)
    model = get_model(args.model, args)

    if args.batch_size > 256 and args.dataset == 'imagenet' and args.model == 'resnet':
        batch_accumulate_num = args.batch_size // 256
    else:
        batch_accumulate_num = 1
    # create model
    if args.dataset == 'imagenet':
        if batch_accumulate_num > 1:
            args.iterations_per_epoch = len(train_loader.dataset.imgs) // 256
        else:
            args.iterations_per_epoch = len(
                train_loader.dataset.imgs) // args.batch_size
        val_len = len(val_loader.dataset.imgs) // 1024
    else:
        args.iterations_per_epoch = len(
            train_loader.dataset.train_labels) // args.batch_size
        val_len = len(val_loader.dataset.test_labels) // 1024

    # get the number of model parameters
    log_str = 'Number of model parameters: {}'.format(
        sum([p.data.nelement() for p in model.parameters()]))
    logging.info(log_str, extra=args.client)
    print(log_str)
    # for training on multiple GPUs.
    model = torch.nn.DataParallel(model)
    model = model.cuda()
    server = ParameterServer.get_server(args.optimizer, model, args)
    val_statistics = Statistics.get_statistics('image_classification', args)
    train_statistics = Statistics.get_statistics('image_classification', args)
    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume + '/checkpoint.pth.tar'):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume + '/checkpoint.pth.tar')
            args.start_epoch = checkpoint['epoch']
            server = checkpoint['server']
            val_statistics = checkpoint['val_stats']
            train_statistics = checkpoint['train_stats']
            model.load_state_dict(checkpoint['state_dict'])
            print('=> loaded checkpoint {} (epoch {})'.format(
                args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    # # Synchronous to Asynchronous Adjustments
    # print('Resetting Parameter Server to Asynchronous Mode')
    # logging.info('Resetting Parameter Server to Asynchronous Mode', extra=args.client)
    # server._shards_weights = list()
    # weights = server._get_model_weights()
    # for i in range(0, args.workers_num):
    #     server._shards_weights.append(deepcopy(weights))
    # server._workers_num = args.workers_num
    # # learning rate initialization
    # batch_baseline = args.baseline
    # server._lr = args.lr * np.sqrt((args.workers_num * args.batch_size) // batch_baseline) / (args.workers_num)
    # server._fast_im = args.fast_im
    # server._lr_warm_up = args.lr_warm_up
    # server._current_lr = args.lr
    # server._m_off = args.m_off
    # server._current_momentum = args.momentum
    # server._iterations_per_epoch = args.iterations_per_epoch
    # server._momentum = args.momentum
    # server._client = args.client
    # if args.fast_im is True:
    #     end_lr = args.lr * ((args.workers_num * args.batch_size) // batch_baseline) / np.sqrt(args.workers_num)
    #     start_lr = args.lr / (args.workers_num)
    #     server._lr = end_lr
    #     server._start_lr = start_lr
    #     server._lr_increment_const = (end_lr - start_lr) / (args.iterations_per_epoch * 5)
    #     log_str = 'Fast ImageNet Mode - Warm Up [{:.5f}]->[{:.5f}] In 5 Epochs'.format(start_lr, end_lr)
    #     logging.info(log_str, extra=args.client)
    #     print(log_str)
    # else:
    #     server._start_lr = 0
    #     server._lr_increment_const = 0
    # for param_group in server._optimizer.param_groups:
    #     param_group['lr'] = start_lr
    #     param_group['momentum'] = server._momentum
    # # Synchronous to Asynchronous Adjustments - End

    cudnn.benchmark = True
    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda()
    if args.bar is True:
        train_bar = IncrementalBar('Training  ',
                                   max=args.iterations_per_epoch,
                                   suffix='%(percent)d%%')
        val_bar = IncrementalBar('Evaluating',
                                 max=val_len,
                                 suffix='%(percent)d%%')
    else:
        train_bar = None
        val_bar = None
    log_str = '{}: Training neural network for {} epochs with {} workers'.format(
        args.id, args.epochs, args.workers_num)
    logging.info(log_str, extra=args.client)
    print(log_str)
    train_time = time.time()
    for epoch in range(args.start_epoch, args.epochs):
        # train for one epoch
        train_loss, train_error = train(train_loader, model, criterion, server,
                                        epoch, args.workers_num,
                                        args.grad_clip, batch_accumulate_num,
                                        train_bar, train_statistics,
                                        args.client)

        train_time = time.time() - train_time
        if args.bar is True:
            train_bar.finish()
            train_bar.index = 0

        # evaluate on validation set
        val_time = time.time()
        with torch.no_grad():
            val_loss, val_error = validate(val_loader, model, criterion,
                                           server, val_statistics, val_bar)
        train_statistics.save_loss(train_loss)
        train_statistics.save_error(train_error)
        train_statistics.save_weight_mean_dist(
            server.get_workers_mean_statistics())
        train_statistics.save_weight_master_dist(
            server.get_workers_master_statistics())
        train_statistics.save_mean_master_dist(server.get_mean_master_dist())
        train_statistics.save_weight_norm(server.get_server_weights())
        train_statistics.save_gradient_norm(server.get_server_gradients())
        val_time = time.time() - val_time
        if args.bar is True:
            val_bar.finish()
            val_bar.index = 0

        log_str = 'Epoch [{0:1d}]: Train: Time [{1:.2f}], Loss [{2:.3f}], Error[{3:.3f}] | ' \
                  'Test: Time [{4:.2f}], Loss [{5:.3f}], Error[{6:.3f}]'.format(epoch + 1, train_time, train_loss,
                                                                                train_error, val_time, val_loss,
                                                                                val_error)
        logging.info(log_str, extra=args.client)
        print(log_str)
        if epoch % args.save == 0 and epoch > 0:
            save_checkpoint(
                {
                    'epoch': epoch + 1,
                    'state_dict': model.state_dict(),
                    'val_stats': val_statistics,
                    'train_stats': train_statistics,
                    'server': server
                },
                sim_name=(args.name + time_stamp + '_' + str(epoch)))
        train_time = time.time()

    return train_statistics, val_statistics
Example #23
t_initial_vertex_features = tf.placeholder(dtype=tf.float32,
                                           shape=[None, 1])
t_vertex_coord_list = [tf.placeholder(dtype=tf.float32, shape=[None, 3])]
for _ in range(len(config['runtime_graph_gen_kwargs']['level_configs'])):
    t_vertex_coord_list.append(
        tf.placeholder(dtype=tf.float32, shape=[None, 3]))
t_edges_list = []
for _ in range(len(config['runtime_graph_gen_kwargs']['level_configs'])):
    t_edges_list.append(tf.placeholder(dtype=tf.int32, shape=[None, 2]))
t_keypoint_indices_list = []
for _ in range(len(config['runtime_graph_gen_kwargs']['level_configs'])):
    t_keypoint_indices_list.append(
        tf.placeholder(dtype=tf.int32, shape=[None, 1]))
t_is_training = tf.placeholder(dtype=tf.bool, shape=[])
model = get_model(config['model_name'])(num_classes=NUM_CLASSES,
                                        box_encoding_len=BOX_ENCODING_LEN,
                                        mode='test',
                                        **config['model_kwargs'])
t_logits, t_pred_box = model.predict(t_initial_vertex_features,
                                     t_vertex_coord_list,
                                     t_keypoint_indices_list, t_edges_list,
                                     t_is_training)
t_probs = model.postprocess(t_logits)
t_predictions = tf.argmax(t_probs, axis=1, output_type=tf.int32)
# optimizers ==================================================================
global_step = tf.Variable(0, dtype=tf.int32, trainable=False)
fetches = {
    'step': global_step,
    'predictions': t_predictions,
    'probs': t_probs,
    'pred_box': t_pred_box
}
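The placeholders and the fetches dict above are meant to be evaluated in a TF1 session; a minimal usage sketch, where vertex_features, vertex_coord_list, keypoint_indices_list, and edges_list are assumed to come from the runtime graph generator:

# tf and the t_* tensors come from the example above (TF1 graph mode)
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    feed_dict = {t_initial_vertex_features: vertex_features,
                 t_is_training: False}
    for placeholder, value in zip(t_vertex_coord_list, vertex_coord_list):
        feed_dict[placeholder] = value
    for placeholder, value in zip(t_keypoint_indices_list, keypoint_indices_list):
        feed_dict[placeholder] = value
    for placeholder, value in zip(t_edges_list, edges_list):
        feed_dict[placeholder] = value
    results = sess.run(fetches, feed_dict=feed_dict)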
Example #24
def get_models(img_rows, img_cols, fold):
    model, process = get_model(network=network, input_shape=(img_rows, img_cols, 1),
                               freeze_encoder=False)
    model.load_weights(weights_path + str(fold) + '.hdf5')
    model.compile(optimizer=Adam(lr=learning_rate), loss=make_loss(loss_name=loss_function), metrics=[dice_coef])
    return model, process