Example #1
def fill_missing_bLSTM(panel, epochs=100):
    non_nans = 1 - np.isnan(panel.values)
    X_train = panel.fillna(0).values
    n_samples, n_timesteps, n_feat = X_train.shape
    main_input = Input(shape=(n_timesteps, n_feat), name='main_input')
    lstm = Bidirectional(LSTM(120, return_sequences=True))(main_input)
    unmasked_outputs = TimeDistributed(Dense(n_feat))(lstm)
    bool_input = Input(shape=(n_timesteps, n_feat), name='isnan_inputs')
    masked_outputs = merge([unmasked_outputs, bool_input], mode='mul')
    model = Model(input=[main_input, bool_input], output=masked_outputs)
    print(model.count_params())
    unmasked_model = Model(input=main_input, output=unmasked_outputs)
    model.compile(optimizer='rmsprop', loss='mse')
    early_stopping = EarlyStopping(patience=20)
    history = model.fit([X_train, non_nans],
                        X_train,
                        nb_epoch=epochs,
                        validation_split=0.1,
                        callbacks=[early_stopping])
    plot_loss(history)
    unmasked_model.compile(optimizer='rmsprop', loss='mae')
    X_train = unmasked_model.predict(X_train)
    fpanel = pd.Panel(data=X_train,
                      items=panel.axes[0],
                      major_axis=panel.axes[1],
                      minor_axis=panel.axes[2])
    return fpanel
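Example #1 targets the legacy Keras 1.x API (merge(..., mode='mul'), Model(input=..., output=...), nb_epoch, Python 2 print). For reference, a rough tf.keras (2.x) sketch of the same masked-output idea follows; the names and layer sizes mirror the example, but this is an untested adaptation rather than the original author's code.

# Sketch only: a tf.keras adaptation of the masked bLSTM filler above, assuming
# `values` is a float array of shape (samples, timesteps, features) containing NaNs.
import numpy as np
from tensorflow.keras.layers import Input, Bidirectional, LSTM, TimeDistributed, Dense, Multiply
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping

def fill_missing_blstm_tf2(values, epochs=100):
    non_nans = (~np.isnan(values)).astype('float32')  # 1 where observed, 0 where NaN
    x_train = np.nan_to_num(values)
    n_samples, n_timesteps, n_feat = x_train.shape

    main_input = Input(shape=(n_timesteps, n_feat), name='main_input')
    bool_input = Input(shape=(n_timesteps, n_feat), name='isnan_input')
    lstm = Bidirectional(LSTM(120, return_sequences=True))(main_input)
    unmasked = TimeDistributed(Dense(n_feat))(lstm)
    masked = Multiply()([unmasked, bool_input])  # loss only sees observed entries

    model = Model(inputs=[main_input, bool_input], outputs=masked)
    model.compile(optimizer='rmsprop', loss='mse')
    model.fit([x_train, non_nans], x_train, epochs=epochs,
              validation_split=0.1, callbacks=[EarlyStopping(patience=20)])

    # Predict with the unmasked head to obtain values at the NaN positions as well.
    return Model(inputs=main_input, outputs=unmasked).predict(x_train)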
Example #2
def iterative_fill_bLSTM(panel, epochs=5000, iterations=5):
    nans = np.isnan(panel.values) + 0
    ians = 1 - nans
    X_train = panel.fillna(0).values
    for iteration in range(iterations):
        n_samples, n_timesteps, n_feat = X_train.shape
        main_input = Input(shape=(n_timesteps, n_feat), name='main_input')
        lstm = Bidirectional(
            LSTM(120, dropout_W=0.5, dropout_U=0.2,
                 return_sequences=True))(main_input)
        # lstm2 = Bidirectional(LSTM(60, dropout_W = 0.5, dropout_U = 0.2, return_sequences=True))(lstm)
        unmasked_outputs = TimeDistributed(Dense(n_feat))(lstm)
        nan_input = Input(shape=(n_timesteps, n_feat), name='isnan_inputs')
        ian_input = Input(shape=(n_timesteps, n_feat), name='isnum_inputs')
        masked_outputs = merge([unmasked_outputs, ian_input], mode='mul')
        only_original_outputs = merge([main_input, nan_input], mode='mul')
        final_output = merge([masked_outputs, only_original_outputs],
                             mode='sum')
        model = Model(input=[main_input, ian_input, nan_input],
                      output=final_output)
        model.compile(optimizer='rmsprop', loss='mse')
        early_stopping = EarlyStopping(patience=20)
        history = model.fit([X_train, ians, nans],
                            X_train,
                            nb_epoch=epochs,
                            validation_split=0.1,
                            callbacks=[early_stopping])
        plot_loss(history)

        unmasked_model = Model(input=main_input, output=unmasked_outputs)
        unmasked_model.compile(optimizer='rmsprop', loss='mse')
        X_train = unmasked_model.predict(X_train)
    fpanel = pd.Panel(data=X_train,
                      items=panel.axes[0],
                      major_axis=panel.axes[1],
                      minor_axis=panel.axes[2])
    return fpanel
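The three merges above implement a simple masking identity: the network output is kept at observed positions and the (zero-filled) original at NaN positions, so under MSE only observed entries are penalized. A small NumPy illustration of that arithmetic, using made-up toy arrays:

# Toy illustration of the mask arithmetic used by the merges above (not part of the model).
import numpy as np

values = np.array([[1.0, np.nan, 3.0]])   # one toy row
nans = np.isnan(values).astype(float)     # 1 where missing
ians = 1.0 - nans                         # 1 where observed
x = np.nan_to_num(values)                 # zero-filled input / training target
pred = np.array([[0.9, 2.1, 2.8]])        # pretend network output

final = pred * ians + x * nans            # merge([masked, only_original], mode='sum')
# final == [[0.9, 0.0, 2.8]]; comparing it to x under MSE penalizes only observed entries.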
Example #3
def train_vm(data_loader, val_loader, vm, fp, device, lr, weight_decay, iters, epochs=1,
             val_report_iter=50, model_save_path=None, loss_visualize=False, loss_name="dice", fg_thresh=0.5):
    # set model to train mode
    vm.feat_net.train()

    weight_num = sum(p.numel() for p in vm.feat_net.parameters() if p.requires_grad)
    logger.debug("Number of trainable parameters in VideoMatch: {}".format(weight_num))

    optimizer = optim.Adam(vm.feat_net.parameters(), lr=lr, weight_decay=weight_decay)

    stop_training = False

    # save model on SIGINT (Ctrl + c)
    def sigint_handler(signal, frame):
        logger.info("Ctrl+c caught, stopping the training and saving the model...")
        nonlocal stop_training
        stop_training = True

    signal.signal(signal.SIGINT, sigint_handler)

    logger.debug("Using foreground threshold {}".format(fg_thresh))
    logger.debug("Running untrained VideoMatch on validation set...")

    # check videomatch avg val accuracy
    vm_avg_val_score = 0.
    with torch.set_grad_enabled(False):
        for val_ref_frame, val_test_frame in tqdm(val_loader):
            # TODO: needs optimization, this validation pass takes about 12 min
            (ref_img, ref_mask), (test_img, test_mask) = fp(val_ref_frame, val_test_frame)

            vm.seq_init(ref_img, ref_mask)
            fg_prob, _ = vm.predict_fg_bg(test_img)
            # vm_avg_val_score += segmentation_accuracy(fg_prob, test_mask.to(device))
            vm_avg_val_score += segmentation_IOU(fg_prob.cpu(), test_mask, fg_thresh)

    logger.debug("Untrained Videomatch IOU on validation set: {:.3f}".format(vm_avg_val_score / len(val_loader)))

    if loss_name == "dice":
        loss_function = dice_loss
    elif loss_name == "bce":
        loss_function = torch.nn.BCELoss()
    elif loss_name == "balancedbce":
        loss_function = balanced_CE_loss
    else:
        raise ValueError("Loss function {} is uknown, use 'dice', 'bce' or 'balancedbce'!".format(loss_name))

    logger.debug("Using loss function {}".format(loss_name))
    logger.debug("Training started...")

    loss_list = []
    val_score_list = []
    for epoch in range(epochs):
        logger.debug("Epoch: \t[{}/{}]".format(epoch + 1, epochs))

        avg_loss = 0.
        for i, (ref_frame, test_frame) in tqdm(enumerate(data_loader)):
            if i >= iters or stop_training:
                break

            # preprocess
            (ref_img, ref_mask), (test_img, test_mask) = fp(ref_frame, test_frame)
            test_mask = test_mask.unsqueeze(0).to(device).float()

            # initialize every time since reference image keeps changing
            vm.seq_init(ref_img, ref_mask)

            # Use softmaxed foreground probability and groundtruth to compute BCE loss
            fg_prob, _ = vm.predict_fg_bg(test_img)

            loss = loss_function(fg_prob, test_mask)
            avg_loss += loss.data.mean().cpu().numpy()

            if ((i + 1) % val_report_iter == 0 or i + 1 == iters) and i > 0:
                vm_avg_val_score = 0.
                val_cnt = 0
                with torch.set_grad_enabled(False):
                    for val_ref_frame, val_test_frame in val_loader:
                        (ref_img, ref_mask), (test_img, test_mask) = fp(val_ref_frame, val_test_frame)

                        vm.seq_init(ref_img, ref_mask)
                        fg_prob, _ = vm.predict_fg_bg(test_img)
                        vm_avg_val_score += segmentation_IOU(fg_prob.cpu(), test_mask, fg_thresh)
                        val_cnt += 1

                logger.debug("Iter [{:5d}/{}]:\tavg loss = {:.4f},\tavg val IOU = {:.3f}"
                             .format(i + 1, iters, avg_loss / val_report_iter, vm_avg_val_score / val_cnt))

                val_score_list.append(vm_avg_val_score / val_cnt)
                loss_list.append(avg_loss / val_report_iter)
                avg_loss = 0.

            # backpropagation
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        if stop_training:
            break

    if model_save_path is not None:
        logger.info("Saving model to path {}".format(model_save_path))
        vm.save_model(model_save_path)

    if loss_visualize:
        if not loss_list:
            logger.info("Loss list is empty, omitting loss visualization!")
        else:
            bins = 0 if len(loss_list) < 500 else 50
            plot_loss(loss_list, val_score_list, val_report_iter, bins=bins)
            plt.show()
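Example #3 calls a plot_loss(loss_list, val_score_list, val_report_iter, bins=...) helper that is not shown. A minimal matplotlib stand-in with the same signature might look like the following; the real implementation is not included in the example, so treat this as an assumption.

# Minimal sketch of the plot_loss helper assumed above; the real body is not shown,
# so this is only a plausible stand-in with the same signature.
import numpy as np
import matplotlib.pyplot as plt

def plot_loss(loss_list, val_score_list, val_report_iter, bins=0):
    # `bins` is accepted for signature compatibility; smoothing is omitted in this sketch.
    xs = np.arange(1, len(loss_list) + 1) * val_report_iter  # iteration of each report
    fig, (ax1, ax2) = plt.subplots(2, 1, sharex=True)
    ax1.plot(xs, loss_list)
    ax1.set_ylabel("avg train loss")
    ax2.plot(xs, val_score_list)
    ax2.set_ylabel("avg val IOU")
    ax2.set_xlabel("iteration")
    return fig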
Example #4
    # (snippet truncated above) the training generator is presumably created
    # analogously to valid_generator below:
    train_generator = ImageDataGenerator().flow(x_train,
                                                y_train,
                                                batch_size=opt.batch_size)
    valid_generator = ImageDataGenerator().flow(x_valid,
                                                y_valid,
                                                batch_size=opt.batch_size)
    model = CNN3()
    sgd = SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True)
    model.compile(optimizer=sgd,
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])
    callback = [
        #     EarlyStopping(monitor='val_loss', patience=50, verbose=True),
        #     ReduceLROnPlateau(monitor='lr', factor=0.1, patience=15, verbose=True),
        ModelCheckpoint('../models/cnn3_best_weights.h5',
                        monitor='val_acc',
                        verbose=True,
                        save_best_only=True,
                        save_weights_only=True)
    ]
    history_ck = model.fit_generator(
        train_generator,
        steps_per_epoch=len(y_train) // opt.batch_size,
        epochs=opt.epochs,
        validation_data=valid_generator,
        validation_steps=len(y_valid) // opt.batch_size,
        callbacks=callback)
    his = history_ck

if opt.plot_history:
    plot_loss(his.history, opt.dataset)
    plot_acc(his.history, opt.dataset)
Example #5
def train_and_evaluate(model, train_dataloader, val_dataloader, optimizer, criterion, metrics, params, model_dir,
                       restore_file=None):
    # reload weights from restore_file if specified
    if restore_file is not None:
        restore_path = os.path.join(model_dir, restore_file + '.pth.tar')
        logging.info("Restoring parameters from {}".format(restore_path))
        utils.load_checkpoint(restore_path, model, optimizer)
        
    best_val_acc = 0.0
    best_val_metrics = []
    learning_rate_0 = params.learning_rate
    train_acc_series = []
    val_acc_series = []
    train_loss_series = []
    
    for epoch in range(params.num_epochs):
        logging.info("Epoch {}/{}".format(epoch + 1, params.num_epochs))
        
        # train model
        train_metrics = train(model, train_dataloader, optimizer, criterion, metrics, params)
        
        # learning rate exponential decay
        params.learning_rate = learning_rate_0 * np.exp(-params.exp_decay_k * epoch)
        
        # evaluate
        val_metrics = evaluate(model, criterion, val_dataloader, metrics, params)
        
        # find accuracy from validation dataset
        val_acc = val_metrics['accuracy']
        is_best = val_acc >= best_val_acc
        
        # save weights
        utils.save_checkpoint({'epoch': epoch + 1,
                               'state_dict': model.state_dict(),
                               'optim_dict': optimizer.state_dict()},
                              is_best=is_best,
                              checkpoint=model_dir)
        
        # save accuracy / loss to array for plot
        train_acc_series.append(train_metrics['accuracy'])
        val_acc_series.append(val_metrics['accuracy'])
        train_loss_series.append(train_metrics['loss'])
        
        # If best_eval, best_save_path
        if is_best:
            logging.info("- Found new best accuracy")
            best_val_acc = val_acc
            best_val_metrics = val_metrics

            # Save best val metrics in a json file in the model directory
            best_json_path = os.path.join(
                model_dir, "metrics_val_best_weights.json")
            utils.save_dict_to_json(val_metrics, best_json_path)
        
        # Save latest val metrics in a json file in the model directory
        last_json_path = os.path.join(
            model_dir, "metrics_val_last_weights.json")
        utils.save_dict_to_json(val_metrics, last_json_path)
        print('******************************************')
    
    # plot visualized performance
    visualize.plot_train_val_accuracy(train_acc_series, val_acc_series)
    visualize.plot_loss(train_loss_series)
    # save best validation F1 score plot
    visualize.plot_individual_label_f1score(best_val_metrics)
Example #6
    def train_model(self, train_dataset, val_dataset, learning_rate, epochs, layers):
        """Train the model.
        train_dataset, val_dataset: Training and validation Dataset objects.
        learning_rate: The learning rate to train with
        epochs: Number of training epochs. Note that previous training epochs
                are considered to be done already, so this actually determines
                the epochs to train in total rather than in this particular
                call.
        layers: Allows selecting which layers to train. It can be:
            - A regular expression to match layer names to train
            - One of these predefined values:
              heads: The RPN, classifier and mask heads of the network
              all: All the layers
              3+: Train Resnet stage 3 and up
              4+: Train Resnet stage 4 and up
              5+: Train Resnet stage 5 and up
        """

        # Pre-defined layer regular expressions
        layer_regex = {
            # all layers but the backbone
            "heads": r"(fpn.P5\_.*)|(fpn.P4\_.*)|(fpn.P3\_.*)|(fpn.P2\_.*)|(rpn.*)|(classifier.*)|(mask.*)",
            # From a specific Resnet stage and up
            "3+": r"(fpn.C3.*)|(fpn.C4.*)|(fpn.C5.*)|(fpn.P5\_.*)|(fpn.P4\_.*)|(fpn.P3\_.*)|(fpn.P2\_.*)|(rpn.*)|(classifier.*)|(mask.*)",
            "4+": r"(fpn.C4.*)|(fpn.C5.*)|(fpn.P5\_.*)|(fpn.P4\_.*)|(fpn.P3\_.*)|(fpn.P2\_.*)|(rpn.*)|(classifier.*)|(mask.*)",
            "5+": r"(fpn.C5.*)|(fpn.P5\_.*)|(fpn.P4\_.*)|(fpn.P3\_.*)|(fpn.P2\_.*)|(rpn.*)|(classifier.*)|(mask.*)",
            # All layers
            "all": ".*",
        }
        if layers in layer_regex.keys():
            layers = layer_regex[layers]

        # Data generators
        train_set = data_generator_layer.Dataset(train_dataset, self.config, augment=True)
        train_generator = torch.utils.data.DataLoader(train_set, batch_size=1, shuffle=True, num_workers=4)
        val_set = data_generator_layer.Dataset(val_dataset, self.config, augment=True)
        val_generator = torch.utils.data.DataLoader(val_set, batch_size=1, shuffle=True, num_workers=4)

        # Train
        utils_log.log("\nStarting at epoch {}. LR={}\n".format(self.epoch+1, learning_rate))
        utils_log.log("Checkpoint Path: {}".format(self.checkpoint_path))
        self.set_trainable(layers)

        # Optimizer object
        # Add L2 Regularization
        # Skip gamma and beta weights of batch normalization layers.
        trainables_wo_bn = [param for name, param in self.named_parameters() if param.requires_grad and 'bn' not in name]
        trainables_only_bn = [param for name, param in self.named_parameters() if param.requires_grad and 'bn' in name]
        optimizer = optim.SGD([
            {'params': trainables_wo_bn, 'weight_decay': self.config.WEIGHT_DECAY},
            {'params': trainables_only_bn}
        ], lr=learning_rate, momentum=self.config.LEARNING_MOMENTUM)

        for epoch in range(self.epoch+1, epochs+1):
            utils_log.log("Epoch {}/{}.".format(epoch,epochs))

            # Training
            loss = self.train_epoch(train_generator, optimizer, self.config.STEPS_PER_EPOCH)

            # Validation
            val_loss = self.valid_epoch(val_generator, self.config.VALIDATION_STEPS)

            # Statistics
            self.loss_history.append(loss)
            self.val_loss_history.append(val_loss)
            visualize.plot_loss(self.loss_history, self.val_loss_history, save=True, log_dir=self.log_dir)

            # Save model
            torch.save(self.state_dict(), self.checkpoint_path.format(epoch))

        self.epoch = epochs
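The docstring above selects which layers are trainable through the layer_regex table. A hypothetical two-stage training call, in the style of typical Mask R-CNN training scripts (the dataset and config objects are assumed to exist elsewhere), could look like:

# Hypothetical usage; dataset_train, dataset_val and config are assumed to be set up elsewhere.
model.train_model(dataset_train, dataset_val,
                  learning_rate=config.LEARNING_RATE,
                  epochs=40,
                  layers="heads")        # first train only the RPN/classifier/mask heads

model.train_model(dataset_train, dataset_val,
                  learning_rate=config.LEARNING_RATE / 10,
                  epochs=80,
                  layers="all")          # then fine-tune all layers at a lower rate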
Example #7
def main(reps,
         pretrained_w_path,
         do_module1,
         init_seed=0,
         load_t=0,
         num_epochs=200,
         batchsize=96,
         fine_tune=0,
         patience=500,
         lr_init=1e-3,
         optim='adagrad',
         toy=0,
         num_classes=374):
    res_root = '/home/hoa/Desktop/projects/resources'
    X_path = osp.join(res_root, 'datasets/corel5k/Xaug_train_b01c.npy')
    Y_path = osp.join(res_root, 'datasets/corel5k/Y_train.npy')
    MEAN_IMG_PATH = osp.join(res_root, 'models/ilsvrc_2012_mean.npy')
    snapshot = 50  # save model after every `snapshot` epochs

    drop_p = 0.5  # drop out prob.
    lambda2 = 0.0005 / 2  # l2-regularizer constant
    # step=patience/4 # decay learning after every `step` epochs
    lr_patience = 60  # for learning rate schedule, if optim=='momentum'
    if toy:  # unit testing
        num_epochs = 10
        data_multi = 3
        reps = 2
        #drop_p=0
        #lambda2=0

    # Create name tag for the experiment
    if fine_tune:
        full_or_tune = 'tune'  # description tag for storing associated files
    else:
        full_or_tune = 'full'
    time_stamp = time.strftime("%y%m%d%H%M%S", time.localtime())
    snapshot_root = '../snapshot_models/'

    # LOADING DATA
    print 'LOADING DATA ...'
    X = np.load(X_path)
    Y = np.load(Y_path)
    N = len(Y)

    print 'Raw X,Y shape', X.shape, Y.shape
    if len(X) != len(Y):
        print 'Inconsistent number of input images and labels. X is possibly augmented.'

    MEAN_IMG = np.load(MEAN_IMG_PATH).astype('float32')
    MEAN_IMG_227 = skimage.transform.resize(np.swapaxes(
        np.swapaxes(MEAN_IMG, 0, 1), 1, 2), (227, 227),
                                            mode='nearest',
                                            preserve_range=True)
    MEAN_IMG = np.swapaxes(np.swapaxes(MEAN_IMG_227, 1, 2), 0, 1).reshape(
        (1, 3, 227, 227))

    all_metrics = []  # store metrics in each run
    time_profiles = {
        'train_module1': [],
        'train_module1_eff': [],
        'train_module2': [],
        'test': []
    }  # record training and testing time

    # PREPARE THEANO EXPRESSION FOR BOTH MODULES
    print 'COMPILING THEANO EXPRESSION ...'
    input_var = T.tensor4('inputs')
    target_var = T.imatrix('targets')
    network = build_model(num_classes=num_classes, input_var=input_var)

    # Create a loss expression for training
    prediction = lasagne.layers.get_output(network)
    loss = lasagne.objectives.binary_crossentropy(prediction, target_var)
    weights = lasagne.layers.get_all_params(network, regularizable=True)
    l2reg = theano.shared(floatX(lambda2)) * T.sum(
        [T.sum(w**2) for w in weights])
    loss = loss.mean() + l2reg

    lr = theano.shared(np.array(lr_init, dtype=theano.config.floatX))
    lr_decay = np.array(1. / 3, dtype=theano.config.floatX)

    # Create update expressions for training
    params = lasagne.layers.get_all_params(network, trainable=True)
    # last-layer case is actually very simple:
    # `params` above is a list of all (W,b)-pairs
    # Therefore last layer's (W,b) is params[-2:]
    if fine_tune == 7:  # tuning params from fc7 to fc8
        params = params[-2:]
    # elif fine_tune == 6: # tuning params from fc6 to fc8
    #     params = params[-4:]
    # TODO adjust for per-layer training with local_lr

    if optim == 'momentum':
        updates = lasagne.updates.nesterov_momentum(loss,
                                                    params,
                                                    learning_rate=lr,
                                                    momentum=0.9)
    elif optim == 'rmsprop':
        updates = lasagne.updates.rmsprop(loss,
                                          params,
                                          learning_rate=lr,
                                          rho=0.9,
                                          epsilon=1e-06)
    elif optim == 'adam':
        updates = lasagne.updates.adam(loss,
                                       params,
                                       learning_rate=lr,
                                       beta1=0.9,
                                       beta2=0.999,
                                       epsilon=1e-08)
    elif optim == 'adagrad':
        updates = lasagne.updates.adagrad(loss,
                                          params,
                                          learning_rate=lr,
                                          epsilon=1e-06)

    # Create a loss expression for validation/testing
    test_prediction = lasagne.layers.get_output(network, deterministic=True)
    test_loss = lasagne.objectives.binary_crossentropy(test_prediction,
                                                       target_var)
    test_loss = test_loss.mean() + l2reg
    # zero-one loss with threshold t = 0.5 for reference
    # zero_one_loss = T.abs_((test_prediction > theano.shared(floatX(0.5))) - target_var).sum(axis=1)
    #zero_one_loss /= target_var.shape[1].astype(theano.config.floatX)
    #zero_one_loss = zero_one_loss.mean()

    # Compile a function performing a backward pass (training step)  on a mini-batch (by giving
    # the updates dictionary) and returning the corresponding training loss:
    bwd_fn = theano.function(
        [input_var, target_var],
        loss,
        updates=updates,
    )
    # Compile a second function performing a forward pass,
    # returns validation loss, 0/1 Error, score i.e. Xout:
    fwd_fn = theano.function([input_var, target_var], test_loss)

    # Create a theano function for computing score
    score = lasagne.layers.get_output(network, deterministic=True)
    score_fn = theano.function([input_var], score)

    def compute_score(X, Y, batchsize=batchsize, shuffle=False):
        out = np.zeros(Y.shape)
        batch_id = 0
        for batch in iterate_minibatches(X, Y, batchsize, shuffle=False):
            inputs, _ = batch
            # Flip random half of the batch
            flip_idx = np.random.choice(len(inputs),
                                        size=len(inputs) / 2,
                                        replace=False)
            if len(flip_idx) > 1:
                inputs[flip_idx] = inputs[flip_idx, :, :, ::-1]
            # Subtract mean image
            inputs = (inputs - MEAN_IMG).astype(theano.config.floatX)
            # MEAN_IMG is broadcasted numpy-way, take note if want theano expression instead
            if len(inputs) == batchsize:
                out[batch_id * batchsize:(batch_id + 1) *
                    batchsize] = score_fn(inputs)
                batch_id += 1
            else:
                out[batch_id * batchsize:] = score_fn(inputs)

        return out

    try:
        #  MAIN LOOP FOR EACH RUN
        for seed in np.arange(reps) + init_seed:
            snapshot_name = str(
                num_classes) + 'alex' + time_stamp + full_or_tune + str(seed)
            # reset learning rate
            lr.set_value(lr_init)

            print '\nRUN', seed, '...'
            # Split train/val/test set
            # indicies = np.arange(len(Y))
            # Y_train_val, Y_test, idx_train_val, idx_test = train_test_split(
            #     Y, indicies, random_state=seed, train_size=float(2)/3)
            idx_train_val = np.arange(len(Y))

            # Module 2 training set is composed of module 1 training and validation set
            idx_aug_train_val = data_aug(idx_train_val,
                                         mode='aug',
                                         isMat='idx',
                                         N=N)
            Xaug_train_val = X
            if Xaug_train_val.shape[1] != 3:
                Xaug_train_val = b01c_to_bc01(Xaug_train_val)
            Yaug_train_val = data_aug(Y, mode='aug', isMat='Y', N=N)

            # train/val/test set for module 1
            Y_train, Y_val, idx_train, idx_val = train_test_split(
                Y, idx_train_val, random_state=seed)

            idx_aug_train = idx_train
            Xaug_train = Xaug_train_val[idx_aug_train]
            Yaug_train = Y_train

            idx_aug_val = idx_val
            Xaug_val = Xaug_train_val[idx_aug_val]
            Yaug_val = Y_val

            # Test set
            X_test = np.load(
                osp.join(res_root, 'datasets/corel5k/Xaug_test_b01c.npy'))
            if X_test.shape[1] != 3:
                X_test = b01c_to_bc01(X_test)
            Y_test = np.load(osp.join(res_root, 'datasets/corel5k/Y_test.npy'))

            print "Augmented train/val/test set size:", len(Xaug_train), len(
                Yaug_val), len(X_test)
            print "Augmented (X,Y) dtype:", Xaug_train.dtype, Yaug_val.dtype
            print "Processed Mean image:", MEAN_IMG.dtype, MEAN_IMG.shape

            if toy:  # try to overfit a tiny subset of the data
                Xaug_train = Xaug_train[:batchsize * data_multi +
                                        batchsize / 2]
                Yaug_train = Yaug_train[:batchsize * data_multi +
                                        batchsize / 2]
                Xaug_val = Xaug_val[:batchsize + batchsize / 2]
                Yaug_val = Yaug_val[:batchsize + batchsize / 2]

            # Init by pre-trained weights, if any
            if len(pretrained_w_path) > 0:
                layer_list = lasagne.layers.get_all_layers(
                    network)  # 22 layers
                if pretrained_w_path.endswith('pkl'):
                    # load reference_net
                    # use case: weights initialized from pre-trained reference nets
                    f = open(pretrained_w_path, 'r')
                    w_list = pickle.load(f)  # list of 11 (W,b)-pairs
                    f.close()

                    lasagne.layers.set_all_param_values(
                        layer_list[-3], w_list[:-2])
                    # exclude (W,b) of fc8
                    # BIG NOTE: don't be confused, it's pure coincident that layer_list
                    # and w_list have the same index here. The last element of layer_list are
                    # [.., fc6, drop6, fc7, drop7, fc8], while w_list are
                    # [..., W, b, W, b, W, b] which, eg w_list[-4] and w_list[-3] correspond to
                    # params that are associated with fc7 i.e. params that connect drop6 to fc7

                elif pretrained_w_path.endswith('npz'):
                    # load self-trained net
                    # use case: continue training from a snapshot model
                    with np.load(
                            pretrained_w_path
                    ) as f:  # NOTE: only load snapshot of the same `seed`
                        w_list = [f['arr_%d' % i] for i in range(len(f.files))]
                    lasagne.layers.set_all_param_values(network, w_list)

                elif pretrained_w_path.endswith(
                        '/'):  # init from 1 of the 30 snapshots
                    from os import listdir
                    import re
                    files = [
                        f for f in listdir(pretrained_w_path)
                        if osp.isfile(osp.join(pretrained_w_path, f))
                    ]
                    for file_name in files:
                        regex_seed = 'full%d_' % seed
                        match_seed = re.search(regex_seed, file_name)
                        if match_seed:
                            regex = r"\d+[a-zA-Z]+\d+[a-zA-Z]+\d+\_\d+"
                            match = re.search(regex, file_name)
                            snapshot_name = match.group(0)
                            print snapshot_name
                            with np.load(
                                    osp.join(pretrained_w_path, snapshot_name)
                                    + '.npz') as f:
                                w_list = [
                                    f['arr_%d' % i]
                                    for i in range(len(f.files))
                                ]
                            lasagne.layers.set_all_param_values(
                                network, w_list)

            # START MODULE 1
            module1_time = 0
            if do_module1:
                print 'MODULE 1'
                training_history = {}
                training_history['iter_training_loss'] = []
                training_history['iter_validation_loss'] = []
                training_history['training_loss'] = []
                training_history['validation_loss'] = []
                training_history['learning_rate'] = []

                # http://deeplearning.net/tutorial/gettingstarted.html#early-stopping
                # early-stopping parameters
                n_train_batches = Xaug_train.shape[0] / batchsize
                if Xaug_train.shape[0] % batchsize != 0:
                    n_train_batches += 1
                patience = patience  # look at this many examples regardless
                patience_increase = 2  # wait this much longer when a new best is found
                lr_patience_increase = 1.01
                improvement_threshold = 0.995  # a relative improvement of this much is
                # considered significant; a significant test
                # MIGHT be better
                validation_frequency = min(n_train_batches, patience / 2)
                # go through this many
                # minibatches before checking the network
                # on the validation set; in this case we
                # check every epoch
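                # Worked example of the patience rule below (comment only): with
                # patience=500 and patience_increase=2, a significant improvement at
                # iteration 400 raises patience to max(500, 400*2) = 800, so training
                # runs for at least 400 more iterations before early stopping.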
                best_params = None
                epoch_validation_loss = 0  # indicates that valid_loss has not been computed yet
                best_validation_loss = np.inf
                best_iter = -1
                lr_iter = -1
                test_score = 0.
                start_time = time.time()
                done_looping = False
                epoch = 0

                # Finally, launch the training loop.
                print("Starting training...")
                # We iterate over epochs:
                print(
                    "\nEpoch\tTrain Loss\tValid Loss\tBest-ValLoss-and-Iter\tTime\tL.Rate"
                )
                sys.setrecursionlimit(10000)

                try:  # Early-stopping implementation
                    while (not done_looping) and (epoch < num_epochs):
                        # In each epoch, we do a full pass over the training data:
                        train_err = 0
                        train_batches = 0
                        start_time = time.time()
                        for batch in iterate_minibatches(Xaug_train,
                                                         Yaug_train,
                                                         batchsize,
                                                         shuffle=True):
                            inputs, targets = batch
                            # Horizontal flip half of the images
                            bs = inputs.shape[0]
                            indices = np.random.choice(bs,
                                                       bs / 2,
                                                       replace=False)
                            inputs[indices] = inputs[indices, :, :, ::-1]

                            # Subtract mean image
                            inputs = (inputs - MEAN_IMG).astype(
                                theano.config.floatX)
                            # MEAN_IMG is broadcasted numpy-way, take note if want theano expression instead

                            train_err_batch = bwd_fn(inputs, targets)
                            train_err += train_err_batch
                            train_batches += 1

                            iter_now = epoch * n_train_batches + train_batches
                            training_history['iter_training_loss'].append(
                                train_err_batch)
                            training_history['iter_validation_loss'].append(
                                epoch_validation_loss)

                            if (iter_now + 1) % validation_frequency == 0:
                                # a full pass over the validation data:
                                val_err = 0
                                #zero_one_err = 0
                                val_batches = 0
                                for batch in iterate_minibatches(
                                        Xaug_val,
                                        Yaug_val,
                                        batchsize,
                                        shuffle=False):
                                    inputs, targets = batch
                                    # Subtract mean image
                                    inputs = (inputs - MEAN_IMG).astype(
                                        theano.config.floatX)
                                    # MEAN_IMG is broadcasted numpy-way, take note if want theano expression instead

                                    val_err_batch = fwd_fn(inputs, targets)
                                    val_err += val_err_batch
                                    val_batches += 1
                                epoch_validation_loss = val_err / val_batches
                                if epoch_validation_loss < best_validation_loss:
                                    if epoch_validation_loss < best_validation_loss * improvement_threshold:
                                        patience = max(
                                            patience,
                                            iter_now * patience_increase)
                                        # lr_patience *= lr_patience_increase

                                    best_params = lasagne.layers.get_all_param_values(
                                        network)
                                    best_validation_loss = epoch_validation_loss
                                    best_iter = iter_now
                                    lr_iter = best_iter

                                else:  # decay learning rate if optim=='momentum'
                                    if optim == 'momentum' and (
                                            iter_now - lr_iter) > lr_patience:
                                        lr.set_value(lr.get_value() * lr_decay)
                                        lr_iter = iter_now

                            if patience <= iter_now:
                                done_looping = True
                                break

                        # Record training history
                        training_history['training_loss'].append(train_err /
                                                                 train_batches)
                        training_history['validation_loss'].append(
                            epoch_validation_loss)
                        training_history['learning_rate'].append(
                            lr.get_value())

                        epoch_time = time.time() - start_time
                        module1_time += epoch_time
                        # Then we print the results for this epoch:
                        print("{}\t{:.6f}\t{:.6f}\t{:.6f}\t{}\t{:.3f}\t{}".
                              format(epoch + 1,
                                     training_history['training_loss'][-1],
                                     training_history['validation_loss'][-1],
                                     best_validation_loss, best_iter + 1,
                                     epoch_time,
                                     training_history['learning_rate'][-1]))

                        if (
                                epoch + 1
                        ) % snapshot == 0:  # TODO try to save weights at best_iter
                            snapshot_path_string = snapshot_root + snapshot_name + '_' + str(
                                iter_now + 1)
                            try:  # use case: terminate experiment before reaching `reps`
                                np.savez(snapshot_path_string + '.npz',
                                         *best_params)
                                np.savez(snapshot_path_string + '_history.npz',
                                         training_history)
                                plot_loss(training_history,
                                          snapshot_path_string + '_loss.png')
                                # plot_conv_weights(lasagne.layers.get_all_layers(network)[1],
                                #     snapshot_path_string+'_conv1weights_')
                            except (KeyboardInterrupt, TypeError):
                                print 'Did not save', snapshot_name + '_' + str(
                                    iter_now + 1)
                                pass

                        epoch += 1

                except (KeyboardInterrupt, MemoryError):  # fall through so the model can still be saved
                    pass
                print 'Training finished or KeyboardInterrupt (Training is never finished, only abandoned)'

                module1_time_eff = module1_time / iter_now * best_iter
                print('Total and Effective training time are {:.0f} and {:.0f}'
                      .format(module1_time, module1_time_eff))
                time_profiles['train_module1'].append(module1_time)
                time_profiles['train_module1_eff'].append(module1_time_eff)

                # Save model after num_epochs or KeyboardInterrupt
                if (epoch + 1) % snapshot != 0:  # to avoid duplicate save
                    snapshot_path_string = snapshot_root + snapshot_name + '_' + str(
                        iter_now + 1)
                    if not toy:
                        try:  # use case: terminate experiment before reaching `reps`
                            print 'Saving model...'
                            np.savez(snapshot_path_string + '.npz',
                                     *best_params)
                            np.savez(snapshot_path_string + '_history.npz',
                                     training_history)
                            plot_loss(training_history,
                                      snapshot_path_string + '_loss.png')
                            # plot_conv_weights(lasagne.layers.get_all_layers(network)[1],
                            #     snapshot_path_string+'_conv1weights_')
                        except (KeyboardInterrupt, TypeError):
                            print 'Did not save', snapshot_name + '_' + str(
                                iter_now + 1)
                            pass
                # And load them again later on like this:
                #with np.load('../snapshot_models/23alex16042023213910.npz') as f:
                #    param_values = [f['arr_%d' % i] for i in range(len(f.files))] # or
                #    training_history = f['arr_0'].items()
                # lasagne.layers.set_all_param_values(network, param_values)

            # END OF MODULE 1

            # START MODULE 2
            print '\nMODULE 2'
            if not do_module1:
                if pretrained_w_path.endswith('pkl'):
                    snapshot_name = str(
                        num_classes
                    ) + 'alexOTS'  # short for "off-the-shelf init"

                elif pretrained_w_path.endswith(
                        'npz'):  # Resume from a SINGLE snapshot
                    # extract name pattern, e.g. '23alex16042023213910full10'
                    # from string '../snapshot_models/23alex16042023213910full10_100.npz'
                    import re
                    regex = r"\d+[a-zA-Z]+\d+[a-zA-Z]+\d+"
                    match = re.search(regex, pretrained_w_path)
                    snapshot_name = match.group(0)

                elif pretrained_w_path.endswith(
                        '/'):  # RESUMED FROM TRAINED MODULE 1 (ONE-TIME USE)
                    from os import listdir
                    import re
                    files = [
                        f for f in listdir(pretrained_w_path)
                        if osp.isfile(osp.join(pretrained_w_path, f))
                    ]
                    for file_name in files:
                        regex_seed = 'full%d_' % seed
                        match_seed = re.search(regex_seed, file_name)
                        if match_seed:
                            regex = r"\d+[a-zA-Z]+\d+[a-zA-Z]+\d+\_\d+"
                            match = re.search(regex, file_name)
                            snapshot_name = match.group(0)
                            print snapshot_name
                            with np.load(
                                    osp.join(pretrained_w_path, snapshot_name)
                                    + '.npz') as f:
                                w_list = [
                                    f['arr_%d' % i]
                                    for i in range(len(f.files))
                                ]
                            lasagne.layers.set_all_param_values(
                                network, w_list)

            else:  # MAIN BRANCH - assume do_module1 is True AND have run `snapshot` epochs
                if (epoch + 1) > snapshot:
                    with np.load(snapshot_path_string + '.npz'
                                 ) as f:  # reload the best params for module 1
                        w_list = [f['arr_%d' % i] for i in range(len(f.files))]
                    lasagne.layers.set_all_param_values(network, w_list)

            score_train = compute_score(Xaug_train_val, Yaug_train_val)
            start_time = time.time()

            if load_t:
                from os import listdir
                import re
                if not pretrained_w_path.endswith('/'):
                    files = [pretrained_w_path]
                else:
                    files = [
                        f for f in listdir(pretrained_w_path)
                        if osp.isfile(osp.join(pretrained_w_path, f))
                    ]
                for file_name in files:
                    regex_seed = '{0}{1}'.format(full_or_tune, seed)
                    match_seed = re.search(regex_seed, file_name)
                    if match_seed:
                        regex = r"\d+[a-zA-Z]+\d+[a-zA-Z]+\d+\_\d+"
                        match = re.search(regex, file_name)
                        snapshot_name = match.group(0)
                        t_train = np.load(
                            osp.join('t', '{0}.npy'.format(snapshot_name)))

            else:  # MAIN BRANCH
                thresholds = Threshold(score_train, Yaug_train_val)
                thresholds.find_t_for()  # determine t_train for each score_train; it will take a while
                t_train = np.asarray(thresholds.t)
                print 't_train is in ', t_train.min(), '..', t_train.max()
                # `thresholds` holds t_train vector in .t attribute
                print('t_train produced in {:.3f}s'.format(time.time() -
                                                           start_time))
                np.save('t/' + snapshot_name + '.npy', t_train)

            # Predictive model for t
            regr = linear_model.RidgeCV(cv=5)
            # Ridge() is LinearClassifier() with L2-reg
            regr.fit(score_train, t_train)

            time_profiles['train_module2'].append(time.time() - start_time)
            # END OF MODULE 2

            # TESTING PHASE
            start_time = time.time()
            score_test = compute_score(X_test, Y_test)
            t_test = regr.predict(score_test)
            print 'original t_test is in ', min(t_test), '..', max(t_test)
            t_test[t_test > 1] = max(t_test[t_test < 1])
            t_test[t_test < 0] = min(
                t_test[t_test > 0])  # ! Keep t_test in [0,1]
            print 'corrected t_test is in ', min(t_test), '..', max(t_test)
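Module 2 above fits a RidgeCV regressor mapping each sample's label scores to a per-sample decision threshold; the clamped t_test would then typically be broadcast against the score matrix to produce binary multi-label predictions. The excerpt stops before that step; a plausible final line, under that assumption:

# Sketch of how the per-sample thresholds would typically be applied; the excerpt above
# ends before this step, so this is an assumption about the remaining code.
Y_pred = (score_test >= t_test[:, np.newaxis]).astype(int)  # shape: (n_test, num_classes)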
Example #8
def main(reps, pretrained_w_path, do_module1, init_seed=0, load_t=0, num_epochs=200,
    batchsize=96, fine_tune=0, patience=500, lr_init = 1e-3, optim='adagrad', toy=0,
    num_classes=23):
    res_root = '/home/hoa/Desktop/projects/resources'
    X_path=osp.join(res_root, 'datasets/msrcv2/Xaug_b01c.npy')
    Y_path=osp.join(res_root, 'datasets/msrcv2/Y.npy')
    MEAN_IMG_PATH=osp.join(res_root, 'models/ilsvrc_2012_mean.npy')
    snapshot=50 # save model after every `snapshot` epochs
    
    drop_p=0.5 # drop out prob.
    lambda2=0.0005/2 # l2-regularizer constant    
    # step=patience/4 # decay learning after every `step` epochs
    lr_patience=60 # for learning rate schedule, if optim=='momentum'    
    if toy: # unit testing
        num_epochs=10
        data_multi=3
        reps = 2        
        #drop_p=0
        #lambda2=0
    
    # Create name tag for the experiment
    if fine_tune:
        full_or_tune = 'tune' # description tag for storing associated files
    else:
        full_or_tune = 'full'
    time_stamp=time.strftime("%y%m%d%H%M%S", time.localtime()) 
    snapshot_root = '../snapshot_models/'
    snapshot_name = str(num_classes)+'alex'+time_stamp+full_or_tune
    
    # LOADING DATA
    print 'LOADING DATA ...'
    X = np.load(X_path)
    Y = np.load(Y_path)
    if X.shape[1]!=3:
        X = b01c_to_bc01(X)
    N = len(Y)

    print 'Raw X,Y shape', X.shape, Y.shape
    if len(X) != len(Y):
        print 'Inconsistent number of input images and labels. X is possibly augmented.'
    
    MEAN_IMG = np.load(MEAN_IMG_PATH)
    MEAN_IMG_227 = skimage.transform.resize(
            np.swapaxes(np.swapaxes(MEAN_IMG,0,1),1,2), (227,227), mode='nearest', preserve_range=True)    
    MEAN_IMG = np.swapaxes(np.swapaxes(MEAN_IMG_227,1,2),0,1).reshape((1,3,227,227))

    all_metrics = [] # store metrics in each run
    time_profiles = {
    'train_module1': [],
    'train_module1_eff': [],
    'train_module2': [],
    'test': []
    } # record training and testing time
   
    # PREPARE THEANO EXPRESSION FOR BOTH MODULES
    print 'COMPILING THEANO EXPRESSION ...'
    input_var = T.tensor4('inputs')
    target_var = T.imatrix('targets')        
    network = build_model(num_classes=num_classes, input_var=input_var)    

    # Create a loss expression for training
    prediction = lasagne.layers.get_output(network)
    loss = lasagne.objectives.binary_crossentropy(prediction, target_var) 
    weights = lasagne.layers.get_all_params(network, regularizable=True)
    l2reg = theano.shared(floatX(lambda2))*T.sum([T.sum(w ** 2) for w in weights])
    loss = loss.mean() + l2reg
    
    lr = theano.shared(np.array(lr_init, dtype=theano.config.floatX))
    lr_decay = np.array(1./3, dtype=theano.config.floatX)
    
    # Create update expressions for training
    params = lasagne.layers.get_all_params(network, trainable=True)
    # last-layer case is actually very simple:
    # `params` above is a list of all (W,b)-pairs
    # Therefore last layer's (W,b) is params[-2:]
    if fine_tune == 7: # tuning params from fc7 to fc8
        params = params[-2:] 
    # elif fine_tune == 6: # tuning params from fc6 to fc8
    #     params = params[-4:]
    # TODO adjust for per-layer training with local_lr
    
    if optim=='momentum':
        updates = lasagne.updates.nesterov_momentum(loss, params, learning_rate=lr, momentum=0.9) 
    elif optim=='rmsprop':
        updates = lasagne.updates.rmsprop(loss, params, learning_rate=lr, rho=0.9, epsilon=1e-06) 
    elif optim=='adam':
        updates = lasagne.updates.adam(
            loss, params, learning_rate=lr, beta1=0.9, beta2=0.999, epsilon=1e-08)
    elif optim=='adagrad':
        updates = lasagne.updates.adagrad(loss, params, learning_rate=lr, epsilon=1e-06)

    # Create a loss expression for validation/testing
    test_prediction = lasagne.layers.get_output(network, deterministic=True)
    test_loss = lasagne.objectives.binary_crossentropy(test_prediction,
                                                            target_var)
    test_loss = test_loss.mean() + l2reg
    # zero-one loss with threshold t = 0.5 for reference
    # zero_one_loss = T.abs_((test_prediction > theano.shared(floatX(0.5))) - target_var).sum(axis=1)
    #zero_one_loss /= target_var.shape[1].astype(theano.config.floatX)
    #zero_one_loss = zero_one_loss.mean()
    
    # Compile a function performing a backward pass (training step)  on a mini-batch (by giving
    # the updates dictionary) and returning the corresponding training loss:
    bwd_fn = theano.function([input_var, target_var], loss, updates=updates,)
    # Compile a second function performing a forward pass, 
    # returns validation loss, 0/1 Error, score i.e. Xout:
    fwd_fn = theano.function([input_var, target_var], test_loss)

    # Create a theano function for computing score
    score = lasagne.layers.get_output(network, deterministic=True)
    score_fn = theano.function([input_var], score)

    def compute_score(X, Y, batchsize=batchsize, shuffle=False):
        out = np.zeros(Y.shape)
        batch_id = 0
        for batch in iterate_minibatches(X, Y, batchsize, shuffle=False):
            inputs, _ = batch
            # Flip random half of the batch
            flip_idx = np.random.choice(len(inputs),size=len(inputs)/2,replace=False)
            if len(flip_idx)>1:
                inputs[flip_idx] = inputs[flip_idx,:,:,::-1]
            # Subtract mean image
            inputs = (inputs - MEAN_IMG).astype(theano.config.floatX) 
            # MEAN_IMG is broadcasted numpy-way, take note if want theano expression instead
            if len(inputs)==batchsize:
                out[batch_id*batchsize : (batch_id+1)*batchsize] = score_fn(inputs)
                batch_id += 1
            else:
                out[batch_id*batchsize : ] = score_fn(inputs)
                
        return out

    try:
        #  MAIN LOOP FOR EACH RUN    
        for seed in np.arange(reps)+init_seed:            
            # reset learning rate
            lr.set_value(lr_init)

            print '\nRUN', seed, '...'
            # Split train/val/test set
            indices = np.arange(len(Y))
            Y_train_val, Y_test, idx_train_val, idx_test = train_test_split(
                Y, indices, random_state=seed, train_size=float(2)/3)
            Y_train, Y_val, idx_train, idx_val = train_test_split(
                Y_train_val, idx_train_val, random_state=seed)
            
            print "Train/val/test set size:",len(idx_train),len(idx_val),len(idx_test)

            idx_aug_train = data_aug(idx_train, mode='aug', isMat='idx', N=N)
            Xaug_train = X[idx_aug_train]
            Yaug_train = data_aug(Y_train, mode='aug', isMat='Y', N=N)

            idx_aug_val = data_aug(idx_val, mode='aug', isMat='idx', N=N)
            Xaug_val = X[idx_aug_val]
            Yaug_val = data_aug(Y_val, mode='aug', isMat='Y', N=N)

            # Module 2 training set is composed of module 1 training and validation set 
            idx_aug_train_val = data_aug(idx_train_val, mode='aug', isMat='idx', N=N)
            Xaug_train_val = X[idx_aug_train_val]
            Yaug_train_val = data_aug(Y_train_val, mode='aug', isMat='Y', N=N)

            # Test set
            X_test = X[idx_test]
            # Y_test is already returned in the first train_test_split

            print "Augmented train/val/test set size:",len(Xaug_train),len(Yaug_val), len(X_test)
            print "Augmented (X,Y) dtype:", Xaug_train.dtype, Yaug_val.dtype
            print "Processed Mean image:",MEAN_IMG.dtype,MEAN_IMG.shape

            if toy: # try to overfit a tiny subset of the data
                Xaug_train = Xaug_train[:batchsize*data_multi + batchsize/2]
                Yaug_train = Yaug_train[:batchsize*data_multi + batchsize/2]
                Xaug_val = Xaug_val[:batchsize + batchsize/2]
                Yaug_val = Yaug_val[:batchsize + batchsize/2]

            # Init by pre-trained weights, if any
            if len(pretrained_w_path)>0:
                layer_list = lasagne.layers.get_all_layers(network) # 22 layers
                if pretrained_w_path.endswith('pkl'): 
                # load reference_net
                # use case: weights initialized from pre-trained reference nets                
                    f = open(pretrained_w_path, 'r')
                    w_list = pickle.load(f) # list of 11 (W,b)-pairs
                    f.close()
                    
                    lasagne.layers.set_all_param_values(layer_list[-3], w_list[:-2]) 
                    # exclude (W,b) of fc8
                    # BIG NOTE: don't be confused, it's pure coincident that layer_list 
                    # and w_list have the same index here. The last element of layer_list are 
                    # [.., fc6, drop6, fc7, drop7, fc8], while w_list are 
                    # [..., W, b, W, b, W, b] which, eg w_list[-4] and w_list[-3] correspond to
                    # params that are associated with fc7 i.e. params that connect drop6 to fc7
                    
                    
                elif pretrained_w_path.endswith('npz'): 
                # load self-trained net 
                # use case: continue training from a snapshot model
                    with np.load(pretrained_w_path) as f: # NOTE: only load snapshot of the same `seed`
                        w_list = [f['arr_%d' % i] for i in range(len(f.files))]
                    lasagne.layers.set_all_param_values(network, w_list)

                elif pretrained_w_path.endswith('/'): # init from 1 of the 30 snapshots
                    from os import listdir
                    import re
                    files = [f for f in listdir(pretrained_w_path) if osp.isfile(osp.join(pretrained_w_path, f))]
                    for file_name in files:
                        regex_seed = 'full%d_' %seed
                        match_seed = re.search(regex_seed, file_name)
                        if match_seed:
                            regex = r"\d+[a-zA-Z]+\d+[a-zA-Z]+\d+\_\d+"
                            match = re.search(regex, file_name)
                            snapshot_name = match.group(0)
                            print snapshot_name
                            with np.load(osp.join(pretrained_w_path,snapshot_name)+'.npz') as f: 
                                w_list = [f['arr_%d' % i] for i in range(len(f.files))] 
                            lasagne.layers.set_all_param_values(network, w_list)

            # START MODULE 1
            module1_time = 0
            if do_module1:
                print 'MODULE 1' 
                training_history={}
                training_history['iter_training_loss'] = []
                training_history['iter_validation_loss'] = []
                training_history['training_loss'] = []
                training_history['validation_loss'] = []
                training_history['learning_rate'] = []
                
                # http://deeplearning.net/tutorial/gettingstarted.html#early-stopping
                # early-stopping parameters
                n_train_batches = Xaug_train.shape[0] / batchsize
                if Xaug_train.shape[0] % batchsize != 0:
                    n_train_batches += 1
                patience = patience  # look at this many examples regardless
                patience_increase = 2     # wait this much longer when a new best is found
                lr_patience_increase = 1.01
                improvement_threshold = 0.995  # a relative improvement of this much is
                                               # considered significant; a significant test
                                               # MIGHT be better
                validation_frequency = min(n_train_batches, patience/2)
                                              # go through this many
                                              # minibatches before checking the network
                                              # on the validation set; in this case we
                                              # check every epoch
                best_params = None
                epoch_validation_loss = 0 # indicates that valid_loss has not been computed yet
                best_validation_loss = np.inf
                best_iter = -1
                lr_iter = -1
                test_score = 0.
                start_time = time.time()
                done_looping = False
                epoch = 0
                
                # Finally, launch the training loop.
                print("Starting training...")
                # We iterate over epochs:
                print("\nEpoch\tTrain Loss\tValid Loss\tBest-ValLoss-and-Iter\tTime\tL.Rate")
                sys.setrecursionlimit(10000)

                try: # Early-stopping implementation
                    while (not done_looping) and (epoch<num_epochs):
                        # In each epoch, we do a full pass over the training data:
                        train_err = 0
                        train_batches = 0
                        start_time = time.time()
                        for batch in iterate_minibatches(Xaug_train, Yaug_train, batchsize, shuffle=True):
                            inputs, targets = batch
                            # Horizontal flip half of the images
                            bs = inputs.shape[0]
                            indices = np.random.choice(bs, bs / 2, replace=False)
                            inputs[indices] = inputs[indices, :, :, ::-1]
                            
                            # Subtract the mean image
                            inputs = (inputs - MEAN_IMG).astype(theano.config.floatX)
                            # MEAN_IMG is broadcast NumPy-style; take note if a Theano expression is wanted instead
                    
                            train_err_batch = bwd_fn(inputs, targets) 
                            train_err += train_err_batch            
                            train_batches += 1
                            
                            iter_now = epoch*n_train_batches + train_batches
                            training_history['iter_training_loss'].append(train_err_batch)
                            training_history['iter_validation_loss'].append(epoch_validation_loss)
                            
                            if (iter_now+1) % validation_frequency == 0:
                                # a full pass over the validation data:       
                                val_err = 0
                                #zero_one_err = 0
                                val_batches = 0
                                for batch in iterate_minibatches(Xaug_val, Yaug_val, batchsize, shuffle=False):
                                    inputs, targets = batch
                                    # Subtract the mean image
                                    inputs = (inputs - MEAN_IMG).astype(theano.config.floatX)
                                    # MEAN_IMG is broadcast NumPy-style; take note if a Theano expression is wanted instead
                                    
                                    val_err_batch = fwd_fn(inputs, targets)
                                    val_err += val_err_batch
                                    val_batches += 1                
                                epoch_validation_loss = val_err / val_batches
                                if epoch_validation_loss < best_validation_loss:
                                    if epoch_validation_loss < best_validation_loss*improvement_threshold:
                                        patience = max(patience, iter_now * patience_increase)
                                        # lr_patience *= lr_patience_increase
                                        
                                    best_params = lasagne.layers.get_all_param_values(network)
                                    best_validation_loss = epoch_validation_loss
                                    best_iter = iter_now
                                    lr_iter = best_iter


                                else: # decay learning rate if optim=='momentum'
                                    if optim=='momentum' and (iter_now - lr_iter) >  lr_patience:
                                        lr.set_value(lr.get_value() * lr_decay) 
                                        lr_iter = iter_now
                            
                            if patience <= iter_now:
                                done_looping = True
                                break
                        
                        # Record training history
                        training_history['training_loss'].append(train_err / train_batches)
                        training_history['validation_loss'].append(epoch_validation_loss)
                        training_history['learning_rate'].append(lr.get_value())

                        epoch_time = time.time() - start_time
                        module1_time += epoch_time
                        # Then we print the results for this epoch:
                        print("{}\t{:.6f}\t{:.6f}\t{:.6f}\t{}\t{:.3f}\t{}".format(
                                epoch+1, 
                                training_history['training_loss'][-1],
                                training_history['validation_loss'][-1],
                                best_validation_loss,
                                best_iter+1,
                                epoch_time,
                                training_history['learning_rate'][-1]
                            ))
                        
                        if (epoch+1)%snapshot==0: # TODO try to save weights at best_iter
                            snapshot_path_string = snapshot_root+snapshot_name+str(seed)+'_'+str(iter_now+1)
                            try: # use case: terminate experiment before reaching `reps`
                                np.savez(snapshot_path_string+'.npz', *best_params)
                                np.savez(snapshot_path_string+'_history.npz', training_history)
                                plot_loss(training_history, snapshot_path_string+'_loss.png')
                                # plot_conv_weights(lasagne.layers.get_all_layers(network)[1], 
                                #     snapshot_path_string+'_conv1weights_')
                            except (KeyboardInterrupt, TypeError):
                                print 'Did not save', snapshot_name+str(seed)+'_'+str(iter_now+1)
                                pass

                        epoch += 1

                except (KeyboardInterrupt, MemoryError): # tuple form catches both; the old comma syntax only caught KeyboardInterrupt
                    pass
                print 'Training finished or KeyboardInterrupt (Training is never finished, only abandoned)'
                
                module1_time_eff = module1_time / iter_now * best_iter 
                print('Total and effective training time are {:.0f}s and {:.0f}s'.format(
                    module1_time, module1_time_eff))
                time_profiles['train_module1'].append(module1_time)
                time_profiles['train_module1_eff'].append(module1_time_eff)
                
                # Save model after num_epochs or KeyboardInterrupt
                if (epoch+1)%snapshot!=0: # to avoid duplicate save
                    snapshot_path_string = snapshot_root+snapshot_name+str(seed)+'_'+str(iter_now+1)
                    if not toy:
                        try: # use case: terminate experiment before reaching `reps`
                            print 'Saving model...'
                            np.savez(snapshot_path_string+'.npz', *best_params)
                            np.savez(snapshot_path_string+'_history.npz', training_history)
                            plot_loss(training_history, snapshot_path_string+'_loss.png')
                            # plot_conv_weights(lasagne.layers.get_all_layers(network)[1], 
                            #     snapshot_path_string+'_conv1weights_')
                        except (KeyboardInterrupt, TypeError):
                            print 'Did not save', snapshot_name+str(seed)+'_'+str(iter_now+1)
                            pass
                # And load them again later on like this:
                # with np.load('../snapshot_models/23alex16042023213910.npz') as f:
                #     param_values = [f['arr_%d' % i] for i in range(len(f.files))]
                # lasagne.layers.set_all_param_values(network, param_values)
                # (the pickled training_history dict is recovered from its own .npz
                # file via np.load(...)['arr_0'].item(), not .items())
            
            # END OF MODULE 1             
                
            # START MODULE 2
            print '\nMODULE 2' 
            if not do_module1:
                if pretrained_w_path.endswith('pkl'):
                    snapshot_name = str(num_classes)+'alexOTS' # short for "off-the-shelf init"
                
                elif pretrained_w_path.endswith('npz'): # Resume from a SINGLE snapshot
                    # extract name pattern, e.g. '23alex16042023213910full10' 
                    # from string '../snapshot_models/23alex16042023213910full10_100.npz'
                    import re
                    regex = r"\d+[a-zA-Z]+\d+[a-zA-Z]+\d+"
                    match = re.search(regex, pretrained_w_path)
                    snapshot_name = match.group(0)
                
                elif pretrained_w_path.endswith('/'): # RESUMED FROM TRAINED MODULE 1 (ONE-TIME USE)
                    from os import listdir
                    import re
                    files = [f for f in listdir(pretrained_w_path) if osp.isfile(osp.join(pretrained_w_path, f))]
                    for file_name in files:
                        regex_seed = 'full%d_' %seed
                        match_seed = re.search(regex_seed, file_name)
                        if match_seed:
                            regex = r"\d+[a-zA-Z]+\d+[a-zA-Z]+\d+\_\d+"
                            match = re.search(regex, file_name)
                            snapshot_name = match.group(0)
                            print snapshot_name
                            with np.load(osp.join(pretrained_w_path,snapshot_name)+'.npz') as f: 
                                w_list = [f['arr_%d' % i] for i in range(len(f.files))] 
                            lasagne.layers.set_all_param_values(network, w_list)

            else: # MAIN BRANCH - do_module1 is True and at least `snapshot` epochs have been run
                if (epoch+1)>snapshot: 
                    with np.load(snapshot_path_string+'.npz') as f: # reload the best params for module 1 
                        w_list = [f['arr_%d' % i] for i in range(len(f.files))] 
                    lasagne.layers.set_all_param_values(network, w_list)
           
            score_train = compute_score(Xaug_train_val, Yaug_train_val)
            start_time = time.time()

            if load_t: # recovery path: the server failed mid-run and only the t vector was backed up
                if pretrained_w_path.endswith('/'):
                    from os import listdir
                    import re
                    files = [f for f in listdir(pretrained_w_path) if osp.isfile(osp.join(pretrained_w_path, f))]
                    for file_name in files:
                        regex_seed = 'full%d_' %seed
                        match_seed = re.search(regex_seed, file_name)
                        if match_seed:
                            regex = r"\d+[a-zA-Z]+\d+[a-zA-Z]+\d+\_\d+"
                            match = re.search(regex, file_name)
                            snapshot_name = match.group(0)
                            t_train = np.load(osp.join('t','{0}.npy'.format(snapshot_name)))

            else: # MAIN BRANCH
                thresholds = Threshold(score_train, Yaug_train_val)
                thresholds.find_t_for() # determine t_train for each score_train. It will take a while
                t_train = np.asarray(thresholds.t)
                print 't_train is in ', t_train.min(), '..', t_train.max() 
                # `thresholds` holds t_train vector in .t attribute
                print('t_train produced in {:.3f}s'.format(time.time()-start_time))
                np.save('t/'+snapshot_name+str(seed)+'.npy', t_train)

            
            # Predictive model for t
            regr = linear_model.RidgeCV(cv=5)
            # RidgeCV is linear regression with L2 regularization,
            # with the regularization strength chosen by cross-validation
            regr.fit(score_train, t_train)
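            # As I read it, this ridge model maps each example's score vector in
            # `score_train` to the per-example threshold in `t_train`, so that unseen
            # test examples can be assigned a threshold via regr.predict(score_test)
            # in the testing phase below.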

            time_profiles['train_module2'].append(time.time()-start_time)
            # END OF MODULE 2        

            # TESTING PHASE
            start_time = time.time()
            score_test = compute_score(X_test, Y_test)
            t_test = regr.predict(score_test)
            print 'original t_test is in ', min(t_test), '..', max(t_test)
            t_test[t_test>1] = max(t_test[t_test<1])
            t_test[t_test<0] = min(t_test[t_test>0]) # ! Keep t_test in [0,1]
            print 'corrected t_test is in ', min(t_test), '..', max(t_test) 
            
            # Predict label 
            metrics = predict_label(score_test, Y_test, t_test, seed, num_classes, verbose=1)        
            time_profiles['test'].append(time.time()-start_time)

            all_metrics.append(metrics)