Example #1
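    # Inner ("primal") optimization: trains the network weights with SGD for the
    # current hyperparameter vector; i_hyper seeds the random weight initialization.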
    def primal_optimizer(hyperparam_vect, i_hyper):
        def indexed_loss_fun(w, meta_vect, i_iter):
            (train_data, train_labels, L2_vect) = meta
            return loss_fun(w, train_data, train_labels, L2_vect)
            #return loss_fun(w, train_data['X'], train_data['T'], L2_vect + np.sum(fake_data.ravel()))

        learning_curve_dict = defaultdict(list)
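        # Record gradient, weight, and velocity norms every `thin` SGD iterations.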
        def callback(x, v, g, i_iter):
            if i_iter % thin == 0:
        #        learning_curve_dict['learning_curve'].append(loss_fun(x, getval(cur_hyperparams['fake_data']), fake_labels))
                learning_curve_dict['grad_norm'].append(np.linalg.norm(g))
                learning_curve_dict['weight_norm'].append(np.linalg.norm(x))
                learning_curve_dict['velocity_norm'].append(np.linalg.norm(v))


        cur_hyperparams = hyperparams.new_vect(hyperparam_vect)
        fake_data = cur_hyperparams['fake_data']
        rs = RandomState((seed, i_hyper))
        W0 = fill_parser(parser, np.exp(fixed_hyperparams['log_param_scale']))
        W0 *= rs.randn(W0.size)
        alphas = np.exp(fixed_hyperparams['log_alphas'])
        betas  = logit(fixed_hyperparams['invlogit_betas'])
        L2_reg = fill_parser(parser, np.exp(fixed_hyperparams['log_L2_reg']))
        meta = kylist(fake_data, fake_labels, L2_reg)
        W_opt = sgd_parsed(grad(indexed_loss_fun), kylist(W0, alphas, betas, meta),
                           parser, callback=callback)
        cur_primal_results['weights'] = getval(W_opt).copy()
        cur_primal_results['learning_curve'] = getval(learning_curve_dict)
        return W_opt, learning_curve_dict
Example #2
    def primal_optimizer(hyperparam_vect, i_hyper):
        def indexed_loss_fun(w, meta_vect, i_iter):
            (train_data, train_labels, L2_vect) = meta
            return loss_fun(w, train_data, train_labels, L2_vect)
            #return loss_fun(w, train_data['X'], train_data['T'], L2_vect + np.sum(fake_data.ravel()))

        learning_curve_dict = defaultdict(list)

        def callback(x, v, g, i_iter):
            if i_iter % thin == 0:
                #        learning_curve_dict['learning_curve'].append(loss_fun(x, getval(cur_hyperparams['fake_data']), fake_labels))
                learning_curve_dict['grad_norm'].append(np.linalg.norm(g))
                learning_curve_dict['weight_norm'].append(np.linalg.norm(x))
                learning_curve_dict['velocity_norm'].append(np.linalg.norm(v))

        cur_hyperparams = hyperparams.new_vect(hyperparam_vect)
        fake_data = cur_hyperparams['fake_data']
        rs = RandomState((seed, i_hyper))
        W0 = fill_parser(parser, np.exp(fixed_hyperparams['log_param_scale']))
        W0 *= rs.randn(W0.size)
        alphas = np.exp(fixed_hyperparams['log_alphas'])
        betas = logit(fixed_hyperparams['invlogit_betas'])
        L2_reg = fill_parser(parser, np.exp(fixed_hyperparams['log_L2_reg']))
        meta = kylist(fake_data, fake_labels, L2_reg)
        W_opt = sgd_parsed(grad(indexed_loss_fun),
                           kylist(W0, alphas, betas, meta),
                           parser,
                           callback=callback)
        cur_primal_results['weights'] = getval(W_opt).copy()
        cur_primal_results['learning_curve'] = getval(learning_curve_dict)
        return W_opt, learning_curve_dict
Example #3
def primal_optimizer(hyperparams_vect, meta_epoch):
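    # Minibatch training loss; seeding the RNG with the iteration index makes the
    # batch selection reproducible, which the reverse (hypergradient) pass relies on.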
    def indexed_loss_fun(w, L2_vect, i_iter):
        rs = RandomState(
            (seed, meta_epoch,
             i_iter))  # Deterministic seed needed for backwards pass.
        idxs = rs.randint(N_train, size=batch_size)
        return loss_fun(w, train_data['X'][idxs], train_data['T'][idxs],
                        L2_vect)

    cur_hyperparams = hyperparams.new_vect(hyperparams_vect)

    rs = RandomState((seed, meta_epoch))

    # Randomly initialize weights
    W0 = fill_parser(parser, np.exp(fixed_hyperparams['log_param_scale']))
    W0 *= rs.randn(W0.size)
    # Init regularization term
    L2_reg = fill_parser(parser, np.exp(fixed_hyperparams['log_L2_reg']))
    # Set step sizes
    alphas = np.exp(cur_hyperparams['log_alphas'])
    # Momentum terms
    betas = logit(cur_hyperparams['invlogit_betas'])

    # Train model
    W_opt = sgd_parsed(grad(indexed_loss_fun), kylist(W0, alphas, betas,
                                                      L2_reg), parser)

    cur_primal_results['weights'] = getval(W_opt).copy()
    return W_opt
Example #4
 def primal_loss(W, hyperparam_vect, i_primal, reg_penalty=True):
     RS = RandomState((seed, i_hyper, i_primal))
     idxs = RS.permutation(N_train)[:batch_size]
     minibatch = dictslice(train_data, idxs)
     loss = reg_loss_fun(W, minibatch, hyperparam_vect, reg_penalty)
     if verbose and i_primal % 10 == 0: print "Iter {0}, loss, {1}".format(i_primal, getval(loss))
     return loss
    def primal_optimizer(hyperparam_vect, i_hyper):
        def indexed_loss_fun(w, L2_vect, i_iter):
            rs = RandomState((seed, i_hyper, i_iter))  # Deterministic seed needed for backwards pass.
            idxs = rs.randint(N_train, size=batch_size)
            return loss_fun(w, train_data["X"][idxs], train_data["T"][idxs], L2_vect)

        learning_curve_dict = defaultdict(list)

        def callback(x, v, g, i_iter):
            if i_iter % thin == 0:
                learning_curve_dict["learning_curve"].append(loss_fun(x, **train_data))
                learning_curve_dict["grad_norm"].append(np.linalg.norm(g))
                learning_curve_dict["weight_norm"].append(np.linalg.norm(x))
                learning_curve_dict["velocity_norm"].append(np.linalg.norm(v))

        cur_hyperparams = hyperparams.new_vect(hyperparam_vect)
        rs = RandomState((seed, i_hyper))
        W0 = fill_parser(parser, np.exp(cur_hyperparams["log_param_scale"]))
        W0 *= rs.randn(W0.size)
        alphas = np.exp(cur_hyperparams["log_alphas"])
        betas = logit(cur_hyperparams["invlogit_betas"])
        L2_reg = fill_parser(parser, np.exp(fixed_hyperparams["log_L2_reg"]))
        W_opt = sgd4(grad(indexed_loss_fun), kylist(W0, alphas, betas, L2_reg), callback)
        # callback(W_opt, N_iters)
        return W_opt, learning_curve_dict
Example #6
    def primal_optimizer(hyperparam_vect, i_hyper):
        def indexed_loss_fun(w, L2_vect, i_iter):
            rs = RandomState(
                (seed, i_hyper,
                 i_iter))  # Deterministic seed needed for backwards pass.
            idxs = rs.randint(N_train, size=batch_size)
            return loss_fun(w, train_data['X'][idxs], train_data['T'][idxs],
                            L2_vect)

        learning_curve_dict = defaultdict(list)

        def callback(x, v, g, i_iter):
            if i_iter % thin == 0:
                learning_curve_dict['learning_curve'].append(
                    loss_fun(x, **train_data))
                learning_curve_dict['grad_norm'].append(np.linalg.norm(g))
                learning_curve_dict['weight_norm'].append(np.linalg.norm(x))
                learning_curve_dict['velocity_norm'].append(np.linalg.norm(v))

        cur_hyperparams = hyperparams.new_vect(hyperparam_vect)
        rs = RandomState((seed, i_hyper))
        W0 = fill_parser(parser, np.exp(cur_hyperparams['log_param_scale']))
        W0 *= rs.randn(W0.size)
        alphas = np.exp(cur_hyperparams['log_alphas'])
        betas = logit(cur_hyperparams['invlogit_betas'])
        L2_reg = fill_parser(parser, np.exp(fixed_hyperparams['log_L2_reg']))
        W_opt = sgd_parsed(grad(indexed_loss_fun),
                           kylist(W0, alphas, betas, L2_reg),
                           parser,
                           callback=callback)
        return W_opt, learning_curve_dict
Example #7
    def hyperloss(hyperparam_vect,
                  i_hyper,
                  alphabets,
                  verbose=True,
                  report_train_loss=False):
        RS = RandomState((seed, i_hyper, "hyperloss"))
        alphabet = shuffle_alphabet(RS.choice(alphabets), RS)
        N_train = alphabet['X'].shape[0] - N_valid_dpts
        train_data = dictslice(alphabet, slice(None, N_train))
        if report_train_loss:
            valid_data = dictslice(alphabet, slice(None, N_valid_dpts))
        else:
            valid_data = dictslice(alphabet, slice(N_train, None))

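        # Regularized loss on a random minibatch of the sampled alphabet's training split.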
        def primal_loss(W, hyperparam_vect, i_primal, reg_penalty=True):
            RS = RandomState((seed, i_hyper, i_primal))
            idxs = RS.permutation(N_train)[:batch_size]
            minibatch = dictslice(train_data, idxs)
            loss = reg_loss_fun(W, minibatch, hyperparam_vect, reg_penalty)
            if verbose and i_primal % 10 == 0:
                print "Iter {0}, loss, {1}".format(i_primal, getval(loss))
            return loss

        W0 = RS.randn(N_weights) * initialization_scale
        W_final = sgd(grad(primal_loss),
                      hyperparam_vect,
                      W0,
                      alpha,
                      beta,
                      N_iters,
                      callback=None)
        return reg_loss_fun(W_final,
                            valid_data,
                            hyperparam_vect,
                            reg_penalty=False)
Example #8
    def primal_optimizer(hyperparam_vect, i_hyper):
        def indexed_loss_fun(w, L2_vect, i_iter):
            rs = RandomState((seed, i_hyper, i_iter))  # Deterministic seed needed for backwards pass.
            idxs = rs.randint(N_train, size=batch_size)
            return loss_fun(w, train_data['X'][idxs], train_data['T'][idxs], L2_vect)

        learning_curve_dict = defaultdict(list)
        def callback(x, v, g, i_iter):
            if i_iter % thin == 0 or i_iter == N_iters or i_iter == 0:
                learning_curve_dict['learning_curve'].append(loss_fun(x, **train_data))
                learning_curve_dict['grad_norm'].append(np.linalg.norm(g))
                learning_curve_dict['weight_norm'].append(np.linalg.norm(x))
                learning_curve_dict['velocity_norm'].append(np.linalg.norm(v))
                learning_curve_dict['iteration'].append(i_iter + 1)
                print "iteration", i_iter

        cur_hyperparams = hyperparams.new_vect(hyperparam_vect)
        rs = RandomState((seed, i_hyper))
        W0 = fill_parser(parser, np.exp(cur_hyperparams['log_param_scale']))
        W0 *= rs.randn(W0.size)
        alphas = np.exp(cur_hyperparams['log_alphas'])
        betas  = logit(cur_hyperparams['invlogit_betas'])
        L2_reg = fill_parser(parser, np.exp(fixed_hyperparams['log_L2_reg']))
        W_opt = sgd_parsed(grad(indexed_loss_fun), kylist(W0, alphas, betas, L2_reg),
                           parser, callback=callback)
        return W_opt, learning_curve_dict
Example #9
 def indexed_loss_fun(w, L2_vect, i_iter):
     rs = RandomState(
         (seed, i_hyper,
          i_iter))  # Deterministic seed needed for backwards pass.
     idxs = rs.randint(N_train, size=batch_size)
     return loss_fun(w, train_data['X'][idxs], train_data['T'][idxs],
                     L2_vect)
 def hyperloss2(transform, i_hyper, cur_train_data, cur_valid_data,
                cur_tests_data):
     RS = RandomState((seed, i_top, i_hyper, "hyperloss"))
     z_vect_0 = RS.randn(N_weights) * np.exp(log_init_scale)
     z_vect_final = train_z2(cur_train_data, z_vect_0, transform)
     w_vect_final = transform_weights(z_vect_final, transform)
     return loss_fun(w_vect_final, **cur_valid_data)
Example #11
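# Top-level run: splits the Omniglot alphabets into train/test sets, initializes a
# per-weight log-scale and offset hyperparameter vector, and reports the initial
# train/validation losses.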
def run(superparams):
    alpha, log_scale_init, offset_init_std = superparams
    RS = RandomState((seed, "top_rs"))
    all_alphabets = omniglot.load_data()
    RS.shuffle(all_alphabets)
    train_alphabets = all_alphabets[:-N_test_alphabets]
    tests_alphabets = all_alphabets[-N_test_alphabets:]
    w_parser, pred_fun, loss_fun, frac_err = make_nn_funs(layer_sizes)
    N_weights = w_parser.vect.size
    hyperparams_0 = VectorParser()
    hyperparams_0['log_scale']  = log_scale_init * np.ones(N_weights)
    hyperparams_0['offset'] = offset_init_std * RS.randn(N_weights)

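    # Loss of the elementwise-transformed weights Z = exp(log_scale) * W + offset,
    # plus an L2 penalty on the untransformed weights when reg_penalty is set.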
    def reg_loss_fun(W, data, hyperparam_vect, reg_penalty):
        hyperparams = hyperparams_0.new_vect(hyperparam_vect)
        Z = np.exp(hyperparams['log_scale']) * W + hyperparams['offset']
        return loss_fun(Z, **data) + np.dot(W, W) * reg_penalty

    def hyperloss(hyperparam_vect, i_hyper, alphabets, verbose=True, report_train_loss=False):
        RS = RandomState((seed, i_hyper, "hyperloss"))        
        alphabet = shuffle_alphabet(RS.choice(alphabets), RS)
        N_train = alphabet['X'].shape[0] - N_valid_dpts
        train_data = dictslice(alphabet, slice(None, N_train))
        if report_train_loss:
            valid_data = dictslice(alphabet, slice(None, N_valid_dpts))
        else:
            valid_data = dictslice(alphabet, slice(N_train, None))
        def primal_loss(W, hyperparam_vect, i_primal, reg_penalty=True):
            RS = RandomState((seed, i_hyper, i_primal))
            idxs = RS.permutation(N_train)[:batch_size]
            minibatch = dictslice(train_data, idxs)
            loss = reg_loss_fun(W, minibatch, hyperparam_vect, reg_penalty)
            if verbose and i_primal % 30 == 0:
                print "Iter {0}, loss, {1}".format(i_primal, getval(loss))
                
            return loss

        W0 = np.zeros(N_weights)
        W_final = sgd(grad(primal_loss), hyperparam_vect, W0, alpha, beta, N_iters, callback=None)
        return reg_loss_fun(W_final, valid_data, hyperparam_vect, reg_penalty=False)

    results = defaultdict(list)
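    # Evaluate the current hyperparameters on several alphabets; per-weight fields
    # are only stored every N_hyper_thin iterations, in float16, to limit memory use.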
    def record_results(hyperparam_vect, i_hyper, g):
        # print "Meta iter {0}. Recording results".format(i_hyper)
        RS = RandomState((seed, i_hyper, "evaluation"))
        new_seed = RS.int32()
        def loss_fun(alphabets, report_train_loss):
            return np.mean([hyperloss(hyperparam_vect, new_seed, alphabets=alphabets,
                                      verbose=False, report_train_loss=report_train_loss)
                            for i in range(N_alphabets_eval)])
        cur_hyperparams = hyperparams_0.new_vect(hyperparam_vect.copy())
        if i_hyper % N_hyper_thin == 0:
            # Storing O(N_weights) is a bit expensive so we thin it out and store in low precision
            for field in cur_hyperparams.names:
                results[field].append(cur_hyperparams[field].astype(np.float16))
        results['train_loss'].append(loss_fun(train_alphabets, report_train_loss=True))
        results['valid_loss'].append(loss_fun(train_alphabets, report_train_loss=False))

    record_results(hyperparams_0.vect, 0, None)
    return [results['train_loss'][0], results['valid_loss'][0]]
Example #12
    def record_results(hyperparam_vect, i_hyper, g):
        # print "Meta iter {0}. Recording results".format(i_hyper)
        RS = RandomState((seed, i_hyper, "evaluation"))
        new_seed = RS.int32()

        def loss_fun(alphabets, report_train_loss):
            return np.mean([
                hyperloss(hyperparam_vect,
                          new_seed,
                          alphabets=alphabets,
                          verbose=False,
                          report_train_loss=report_train_loss)
                for i in range(N_alphabets_eval)
            ])

        cur_hyperparams = hyperparams_0.new_vect(hyperparam_vect.copy())
        if i_hyper % N_hyper_thin == 0:
            # Storing O(N_weights) is a bit expensive so we thin it out and store in low precision
            for field in cur_hyperparams.names:
                results[field].append(cur_hyperparams[field].astype(
                    np.float16))
        results['train_loss'].append(
            loss_fun(train_alphabets, report_train_loss=True))
        results['valid_loss'].append(
            loss_fun(train_alphabets, report_train_loss=False))
Example #13
    def record_results(hyperparam_vect, i_hyper, g):
        # print "Meta iter {0}. Recording results".format(i_hyper)
        RS = RandomState((seed, i_hyper, "evaluation"))
        new_seed = RS.int32()

        def loss_fun(alphabets, report_train_loss):
            return np.mean(
                [
                    hyperloss(
                        hyperparam_vect,
                        new_seed,
                        alphabets=alphabets,
                        verbose=False,
                        report_train_loss=report_train_loss,
                    )
                    for i in range(N_alphabets_eval)
                ]
            )

        cur_hyperparams = hyperparams_0.new_vect(hyperparam_vect.copy())
        if i_hyper % N_hyper_thin == 0:
            # Storing O(N_weights) is a bit expensive so we thin it out and store in low precision
            for field in cur_hyperparams.names:
                results[field].append(cur_hyperparams[field].astype(np.float16))
        results["train_loss"].append(loss_fun(train_alphabets, report_train_loss=True))
        results["valid_loss"].append(loss_fun(train_alphabets, report_train_loss=False))
Example #14
 def primal_loss(W, hyperparam_vect, i_primal, reg_penalty=True):
     RS = RandomState((seed, i_hyper, i_primal))
     idxs = RS.permutation(N_train)[:batch_size]
     minibatch = dictslice(train_data, idxs)
     loss = reg_loss_fun(W, minibatch, hyperparam_vect, reg_penalty)
     if verbose and i_primal % 10 == 0: print "Iter {0}, loss, {1}".format(i_primal, getval(loss))
     return loss
def run(superparams):
    alpha, log_scale_init, offset_init_std = superparams
    RS = RandomState((seed, "top_rs"))
    all_alphabets = omniglot.load_data()
    RS.shuffle(all_alphabets)
    train_alphabets = all_alphabets[:-N_test_alphabets]
    tests_alphabets = all_alphabets[-N_test_alphabets:]
    w_parser, pred_fun, loss_fun, frac_err = make_nn_funs(layer_sizes)
    N_weights = w_parser.vect.size
    hyperparams_0 = VectorParser()
    hyperparams_0['log_scale']  = log_scale_init * np.ones(N_weights)
    hyperparams_0['offset'] = offset_init_std * RS.randn(N_weights)

    def reg_loss_fun(W, data, hyperparam_vect, reg_penalty):
        hyperparams = hyperparams_0.new_vect(hyperparam_vect)
        Z = np.exp(hyperparams['log_scale']) * W + hyperparams['offset']
        return loss_fun(Z, **data) + np.dot(W, W) * reg_penalty

    def hyperloss(hyperparam_vect, i_hyper, alphabets, verbose=True, report_train_loss=False):
        RS = RandomState((seed, i_hyper, "hyperloss"))        
        alphabet = shuffle_alphabet(RS.choice(alphabets), RS)
        N_train = alphabet['X'].shape[0] - N_valid_dpts
        train_data = dictslice(alphabet, slice(None, N_train))
        if report_train_loss:
            valid_data = dictslice(alphabet, slice(None, N_valid_dpts))
        else:
            valid_data = dictslice(alphabet, slice(N_train, None))
        def primal_loss(W, hyperparam_vect, i_primal, reg_penalty=True):
            RS = RandomState((seed, i_hyper, i_primal))
            idxs = RS.permutation(N_train)[:batch_size]
            minibatch = dictslice(train_data, idxs)
            loss = reg_loss_fun(W, minibatch, hyperparam_vect, reg_penalty)
            if verbose and i_primal % 30 == 0:
                print "Iter {0}, loss, {1}".format(i_primal, getval(loss))
                
            return loss

        W0 = np.zeros(N_weights)
        W_final = sgd(grad(primal_loss), hyperparam_vect, W0, alpha, beta, N_iters, callback=None)
        return reg_loss_fun(W_final, valid_data, hyperparam_vect, reg_penalty=False)

    results = defaultdict(list)
    def record_results(hyperparam_vect, i_hyper, g):
        # print "Meta iter {0}. Recording results".format(i_hyper)
        RS = RandomState((seed, i_hyper, "evaluation"))
        new_seed = RS.int32()
        def loss_fun(alphabets, report_train_loss):
            return np.mean([hyperloss(hyperparam_vect, new_seed, alphabets=alphabets,
                                      verbose=False, report_train_loss=report_train_loss)
                            for i in range(N_alphabets_eval)])
        cur_hyperparams = hyperparams_0.new_vect(hyperparam_vect.copy())
        if i_hyper % N_hyper_thin == 0:
            # Storing O(N_weights) is a bit expensive so we thin it out and store in low precision
            for field in cur_hyperparams.names:
                results[field].append(cur_hyperparams[field].astype(np.float16))
        results['train_loss'].append(loss_fun(train_alphabets, report_train_loss=True))
        results['valid_loss'].append(loss_fun(train_alphabets, report_train_loss=False))

    record_results(hyperparams_0.vect, 0, None)
    return [results['train_loss'][0], results['valid_loss'][0]]
 def primal_loss(z_vect, transform, i_primal, record_results=False):
     RS = RandomState((seed, i_primal, "primal"))
     idxs = RS.randint(N_data, size=batch_size)
     minibatch = dictslice(data, idxs)
     w_vect = transform_weights(z_vect, transform)
     loss = loss_fun(w_vect, **minibatch)
     reg = regularization(z_vect)
     return loss + reg
Example #17
 def primal_loss(w_vect, reg, i_primal, record_results=False):
     RS = RandomState((seed, i_primal, "primal"))
     idxs = RS.randint(N_data, size=batch_size)
     minibatch = dictslice(data, idxs)
     loss = loss_fun(w_vect, **minibatch)
     reg = regularization(w_vect, reg)
     if record_results and i_primal % N_thin == 0:
         print "Iter {0}: train: {1}".format(i_primal, getval(loss))
     return loss + reg
Example #19
 def loss_fun(alphabets, report_train_loss):
     RS = RandomState(
         (seed, "evaluation"))  # Same alphabet with i_hyper now
     return np.mean([
         hyperloss(hyperparam_vect,
                   RS.int32(),
                   alphabets=alphabets,
                   verbose=False,
                   report_train_loss=report_train_loss)
         for i in range(N_alphabets_eval)
     ])
 def hyperloss(transform, i_hyper, cur_train_data, cur_valid_data, cur_tests_data, exact_metagrad):
     RS = RandomState((seed, i_top, i_hyper, "hyperloss"))
     z_vect_0 = RS.randn(N_weights) * np.exp(log_init_scale)
     z_vect_final = train_z(cur_train_data, z_vect_0, transform, exact_metagrad)
     w_vect_final = transform_weights(z_vect_final, transform)
     #TODO: print/store losses and error rates here
     print "Training loss (unregularized) = " +str(getval(loss_fun(w_vect_final, **cur_train_data)))
     print "Validation loss = " +str(getval(loss_fun(w_vect_final, **cur_valid_data)))
     print "Test loss = " +str(getval(loss_fun(w_vect_final, **tests_data)))
     print "Training error = "+ str(getval(frac_err(w_vect_final, **cur_train_data)))
     print "Validation error = "+ str(getval(frac_err(w_vect_final, **cur_valid_data)))
     print "Test error = "+ str(getval(frac_err(w_vect_final, **tests_data)))
     return loss_fun(w_vect_final, **cur_valid_data)
        def hyperloss(transform, i_hyper, cur_train_data, cur_valid_data, cur_tests_data):
            RS = RandomState((seed, i_top, i_hyper, "hyperloss"))
            z_vect_0 = RS.randn(N_weights) * np.exp(log_init_scale)
            z_vect_final = train_z(cur_train_data, z_vect_0, transform)
            w_vect_final = transform_weights(z_vect_final, transform) #TODO: initial scale AND regularization
            
         
            train_loss = getval(loss_fun(w_vect_final, **cur_train_data))
            print "Training loss (unregularized) = " +str(train_loss)
            all_train_loss.append(train_loss)
            valid_loss = getval(loss_fun(w_vect_final, **cur_valid_data))
            print "Validation loss = " +str(valid_loss)
            all_valid_loss.append(valid_loss)
            tests_loss = getval(loss_fun(w_vect_final, **cur_tests_data))
            print "Test loss = " +str(tests_loss)
            all_tests_loss.append(tests_loss)
            
            plt.plot(all_train_loss, label="training loss (unregularized)")
            plt.plot(all_valid_loss, label="validation loss")
            plt.plot(all_tests_loss, label="test loss")
            plt.title("loss vs meta iteration")
            plt.xlabel("meta iteration")
            plt.ylabel("loss")
            plt.legend()
            plt.savefig("loss2000_corrected.png")
            plt.clf()
            
            
            train_rate = getval(frac_err(w_vect_final, **cur_train_data))
            print "Training error rate = " +str(train_rate)
            all_train_rates.append(train_rate)
            valid_rate = getval(frac_err(w_vect_final, **cur_valid_data))
            print "Validation error rate = " +str(valid_rate)
            all_valid_rates.append(valid_rate)
            tests_rate = getval(frac_err(w_vect_final, **cur_tests_data))
            print "Test error rate = " +str(tests_rate)
            all_tests_rates.append(tests_rate)
            
            plt.plot(all_train_rates, label="training error rate")
            plt.plot(all_valid_rates, label="validation error rate")
            plt.plot(all_tests_rates, label="test error rate")
            plt.title("error rate vs meta iteration")
            plt.xlabel("meta iteration")
            plt.ylabel("error rate")
            plt.legend()
            plt.savefig("error2000_corrected.png")
            plt.clf()

            
            return loss_fun(w_vect_final, **cur_valid_data)
 def primal_loss(z_vect, transform, i_primal, record_results=False):
     RS = RandomState((seed, i_primal, "primal"))
     idxs = RS.randint(N_data, size=batch_size)
     minibatch = dictslice(data, idxs)
     w_vect = transform_weights(z_vect, transform) #TODO: this is a scale transformation, not regularization!
     loss = loss_fun(w_vect, **minibatch) #use new scale for prediction
     reg = regularization(z_vect) #regularize original scale
     #TODO: should be equivalent: w = z*e^transform, so 
     # f(z*e^transform) + e^\lambda||z||^2 = f(w) + e^\lambda||z||^2 = f(w) + e^(\lambda)||e^-2transform w||^2
     # see process_transform
     
     #if record_results and i_primal % N_thin == 0:
         #print "Iter {0}: train: {1}".format(i_primal, getval(loss))
     return loss + reg
Example #23
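 # Stochastic loss for one script (alphabet): minibatch loss from the shared latent
 # weights, with the regularizer added only once per iteration (when i_script == 0).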
 def sub_primal_stochastic_loss(z_vect, transform_vect, i_primal, i_script):
     RS = RandomState((seed, i_hyper, i_primal, i_script))
     N_train = train_data[i_script]['X'].shape[0]
     idxs = RS.permutation(N_train)[:batch_size]
     minibatch = dictslice(train_data[i_script], idxs)
     loss = loss_from_latents(z_vect, transform_vect, i_script, minibatch)
     reg = regularization(z_vect) if i_script == 0 else 0.0
     if i_primal % N_thin == 0 and i_script == 0:
         print "Iter {0}, full losses: train: {1}, valid: {2}, reg: {3}".format(
             i_primal,
             total_loss(train_data, getval(z_vect)),
             total_loss(valid_data, getval(z_vect)),
             getval(reg) / N_scripts_per_iter)
     return loss + reg
Example #24
 def sub_primal_stochastic_loss(z_vect, transform_vect, i_primal,
                                i_script):
     RS = RandomState((seed, i_hyper, i_primal, i_script))
     N_train = train_data[i_script]['X'].shape[0]
     idxs = RS.permutation(N_train)[:batch_size]
     minibatch = dictslice(train_data[i_script], idxs)
     loss = loss_from_latents(z_vect, transform_vect, i_script,
                              minibatch)
     reg = regularization(z_vect) if i_script == 0 else 0.0
     if i_primal % N_thin == 0 and i_script == 0:
         print "Iter {0}, full losses: train: {1}, valid: {2}, reg: {3}".format(
             i_primal, total_loss(train_data, getval(z_vect)),
             total_loss(valid_data, getval(z_vect)),
             getval(reg) / N_scripts_per_iter)
     return loss + reg
Example #25
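    # Meta-optimization loop: each iteration retrains the weights from a fresh random
    # init under the current regularizer, then takes a hypergradient step on the
    # validation loss of a random train/validation split.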
    def train_reg(reg_0, constraint, N_meta_iter, i_top):
        def hyperloss(reg, i_hyper, cur_train_data, cur_valid_data):
            RS = RandomState((seed, i_top, i_hyper, "hyperloss"))
            w_vect_0 = RS.randn(N_weights) * init_scales
            w_vect_final = train_z(cur_train_data, w_vect_0, reg)
            return loss_fun(w_vect_final, **cur_valid_data)

        hypergrad = grad(hyperloss)
        cur_reg = reg_0
        for i_hyper in range(N_meta_iter):
            if i_hyper % N_meta_thin == 0:
                tests_loss = hyperloss(cur_reg, i_hyper, train_data,
                                       tests_data)
                all_tests_loss.append(tests_loss)
                all_regs.append(cur_reg.copy())
                print "Hyper iter {0}, test loss {1}".format(
                    i_hyper, all_tests_loss[-1])
                print "Cur_reg", np.mean(cur_reg)
            RS = RandomState((seed, i_top, i_hyper, "hyperloss"))
            cur_split = random_partition(train_data, RS,
                                         [N_train - N_valid, N_valid])
            raw_grad = hypergrad(cur_reg, i_hyper, *cur_split)
            constrained_grad = constrain_reg(raw_grad, constraint)
            print constrained_grad
            # cur_reg -= constrained_grad / np.abs(constrained_grad + 1e-8) * meta_alpha
            cur_reg -= constrained_grad * meta_alpha

        return cur_reg
Example #26
    def train_reg(reg_0, constraint, N_meta_iter, i_top):
        def hyperloss(transform, i_hyper, cur_train_data, cur_valid_data):
            RS = RandomState((seed, i_top, i_hyper, "hyperloss"))
            z_vect_0 = RS.randn(N_weights) * np.exp(log_init_scale)
            z_vect_final = train_z(cur_train_data, z_vect_0, transform)
            w_vect_final = transform_weights(z_vect_final, transform)
            return loss_fun(w_vect_final, **cur_valid_data)
        hypergrad = grad(hyperloss)

        def error_rate(transform, i_hyper, cur_train_data, cur_valid_data):
            RS = RandomState((seed, i_top, i_hyper, "hyperloss"))
            z_vect_0 = RS.randn(N_weights) * np.exp(log_init_scale)
            z_vect_final = train_z(cur_train_data, z_vect_0, transform)
            w_vect_final = transform_weights(z_vect_final, transform)
            return frac_err(w_vect_final, **cur_valid_data)

        cur_reg = reg_0
        for i_hyper in range(N_meta_iter):
            if i_hyper % N_meta_thin == 0:
                test_rate = error_rate(cur_reg, i_hyper, train_data, tests_data)
                all_tests_rates.append(test_rate)
                all_transforms.append(cur_reg.copy())
                all_avg_regs.append(np.mean(cur_reg))
                print "Hyper iter {0}, error rate {1}".format(i_hyper, all_tests_rates[-1])
                print "Cur_transform", np.mean(cur_reg)
            RS = RandomState((seed, i_top, i_hyper, "hyperloss"))
            cur_split = random_partition(train_data, RS, [N_train - N_valid, N_valid])
            raw_grad = hypergrad(cur_reg, i_hyper, *cur_split)
            constrained_grad = constrain_reg(raw_grad, constraint)
            cur_reg -= np.sign(constrained_grad) * meta_alpha
        return cur_reg
Example #27
def plot():
    import matplotlib.pyplot as plt
    import matplotlib as mpl
    mpl.rcParams['font.family'] = 'serif'
    mpl.rcParams['image.interpolation'] = 'none'
    with open('results.pkl') as f:
        transform_parser, transform_vects, train_losses, tests_losses = pickle.load(f)

    RS = RandomState((seed, "plotting"))
    fig = plt.figure(0)
    fig.clf()
    ax = fig.add_subplot(111)
    omniglot.show_alphabets(omniglot.load_rotated_alphabets(RS, normalize=False, angle=90), ax=ax)
    ax.plot([0, 20 * 28], [5 * 28, 5 * 28], '--k')
    ax.text(-15, 5 * 28 * 3 / 2 - 60, "Rotated alphabets", rotation='vertical')
    plt.savefig("all_alphabets.png")
    # Plotting transformations
    names = ['no_sharing', 'full_sharing', 'learned_sharing']
    title_strings = {'no_sharing'      : 'Independent nets',
                     'full_sharing'    : 'Shared bottom layer',
                     'learned_sharing' : 'Learned sharing'}
    covar_imgs = {name : build_covar_image(transform_vects[name]) for name in names}

    for i, name in enumerate(names):
        fig = plt.figure(0)
        fig.clf()
        fig.set_size_inches((2, 6))
        ax = fig.add_subplot(111)
        ax.matshow(covar_imgs[name], cmap = mpl.cm.binary)
        ax.set_xticks([])
        ax.set_yticks([])
        plt.savefig('learned_corr_{0}.png'.format(i))
        plt.savefig('learned_corr_{0}.pdf'.format(i))
Example #28
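 # Sums minibatch losses over N_scripts_per_iter randomly chosen scripts and adds
 # a single regularization term on the shared latent weights.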
 def primal_stochastic_loss(z_vect, transform_vect, i_primal):
     RS = RandomState((seed, i_hyper, i_primal))
     loss = 0.0
     for _ in range(N_scripts_per_iter):
         i_script = RS.randint(N_scripts)
         N_train = train_data[i_script]['X'].shape[0]
         idxs = RS.permutation(N_train)[:batch_size]
         minibatch = dictslice(train_data[i_script], idxs)
         loss += loss_from_latents(z_vect, transform_vect, i_script, minibatch)
     reg  = regularization(z_vect)
     if i_primal % 1 == 0:
         print "Iter {0}, loss {1}, reg {2}".format(i_primal, getval(loss), getval(reg))
         print "Full losses: train: {0}, valid: {1}".format(
             total_loss(train_data, getval(z_vect)),
             total_loss(valid_data, getval(z_vect)))
     return loss + reg
Example #29
    def train_reg(transform_0, constraint, N_meta_iter, i_top):
        def hyperloss(transform, i_hyper, cur_train_data, cur_valid_data):
            RS = RandomState((seed, i_top, i_hyper, "hyperloss"))
            z_vect_0 = RS.randn(N_weights) * np.exp(log_init_scale)
            z_vect_final = train_z(cur_train_data, z_vect_0, transform)
            w_vect_final = transform_weights(z_vect_final, transform)
            return loss_fun(w_vect_final, **cur_valid_data)

        hypergrad = grad(hyperloss)
        cur_transform = transform_0
        for i_hyper in range(N_meta_iter):
            if i_hyper % N_meta_thin == 0:
                tests_loss = hyperloss(cur_transform, i_hyper, train_data,
                                       tests_data)
                all_tests_loss.append(tests_loss)
                all_transforms.append(cur_transform.copy())
                print "Hyper iter {0}, test loss {1}".format(
                    i_hyper, all_tests_loss[-1])
            RS = RandomState((seed, i_top, i_hyper, "hyperloss"))
            cur_split = random_partition(train_data, RS,
                                         [N_train - N_valid, N_valid])
            raw_grad = hypergrad(cur_transform, i_hyper, *cur_split)
            constrained_grad = constrain_transform(raw_grad, constraint)
            cur_transform -= constrained_grad * meta_alpha
        return cur_transform
Example #30
 def primal_stochastic_loss(z_vect, transform_vect, i_primal):
     RS = RandomState((seed, i_hyper, i_primal))
     loss = 0.0
     for _ in range(N_scripts_per_iter):
         i_script = RS.randint(N_scripts)
         N_train = train_data[i_script]['X'].shape[0]
         idxs = RS.permutation(N_train)[:batch_size]
         minibatch = dictslice(train_data[i_script], idxs)
         loss += loss_from_latents(z_vect, transform_vect, i_script, minibatch)
     reg  = regularization(z_vect)
     if i_primal % 20 == 0:
         print "Iter {0}, loss {1}, reg {2}".format(i_primal, getval(loss), getval(reg))
         print "Full losses: train: {0}, valid: {1}".format(
             total_loss(train_data, getval(z_vect)),
             total_loss(valid_data, getval(z_vect)))
     return loss + reg
    def train_reg(reg_0, constraint, N_meta_iter, i_top, exact_metagrad):
        def hyperloss(transform, i_hyper, cur_train_data, cur_valid_data, cur_tests_data, exact_metagrad):
            RS = RandomState((seed, i_top, i_hyper, "hyperloss"))
            z_vect_0 = RS.randn(N_weights) * np.exp(log_init_scale)
            z_vect_final = train_z(cur_train_data, z_vect_0, transform, exact_metagrad)
            w_vect_final = transform_weights(z_vect_final, transform)
            #TODO: print/store losses and error rates here
            print "Training loss (unregularized) = " +str(getval(loss_fun(w_vect_final, **cur_train_data)))
            print "Validation loss = " +str(getval(loss_fun(w_vect_final, **cur_valid_data)))
            print "Test loss = " +str(getval(loss_fun(w_vect_final, **tests_data)))
            print "Training error = "+ str(getval(frac_err(w_vect_final, **cur_train_data)))
            print "Validation error = "+ str(getval(frac_err(w_vect_final, **cur_valid_data)))
            print "Test error = "+ str(getval(frac_err(w_vect_final, **tests_data)))
            return loss_fun(w_vect_final, **cur_valid_data)
        hypergrad = grad(hyperloss) #No chain rule here

            
        '''def error_rate(transform, i_hyper, cur_train_data, cur_valid_data):
            RS = RandomState((seed, i_top, i_hyper, "hyperloss"))
            z_vect_0 = RS.randn(N_weights) * np.exp(log_init_scale)
            z_vect_final = train_z(cur_train_data, z_vect_0, transform) #TODO: recomputing path?
            w_vect_final = transform_weights(z_vect_final, transform)
            return frac_err(w_vect_final, **cur_valid_data)'''

        cur_reg = reg_0
        for i_hyper in range(N_meta_iter):
            print "Hyper iter "+ str(i_hyper)
            """if i_hyper % N_meta_thin == 0:
                test_rate = error_rate(cur_reg, i_hyper, train_data, tests_data)
                all_tests_rates.append(test_rate)
                all_transforms.append(cur_reg.copy())
                all_avg_regs.append(np.mean(cur_reg))
                print "Hyper iter {0}, error rate {1}".format(i_hyper, all_tests_rates[-1])
                print "Cur_transform", np.mean(cur_reg)"""
            RS = RandomState((seed, i_top, i_hyper, "hyperloss"))
            #cur_split = random_partition(train_data, RS, [N_train - N_valid, N_valid]) #cur_train_data, cur_valid_data
            #raw_grad = hypergrad(cur_reg, i_hyper, *cur_split)
            cur_train_data, cur_valid_data = random_partition(train_data, RS, [N_train - N_valid, N_valid])
            raw_grad = hypergrad(cur_reg, i_hyper, cur_train_data, cur_valid_data, tests_data, exact_metagrad)
            #print "before constraining grad"
            constrained_grad = constrain_reg(raw_grad, constraint)
            # TODO: can put exact hypergradient here, using constraint
            #print "after constraining grad, before constraining exact"
            # TODO: DrMAD norm matches after constraining, but not exact norm?? Why???
            # This one is about 4x larger than constrained one
            print np.linalg.norm(raw_grad)
            print np.linalg.norm(exact_metagrad[0])
            constrained_exact_grad = constrain_reg(exact_metagrad[0], constraint)
            #print "after constraining exact"
            # TODO: compute statistics
            # TODO: sometimes negative???
            print("cosine of angle between DrMAD and exact = "
                +str(np.dot(constrained_grad, constrained_exact_grad)/(np.linalg.norm(constrained_grad)*np.linalg.norm(constrained_exact_grad))))
            print("cosine of angle between signs of DrMAD and exact = "
                +str(np.dot(np.sign(constrained_grad), np.sign(constrained_exact_grad))/len(constrained_grad)))
            print("DrMAD norm = "+ str(np.linalg.norm(constrained_grad)))
            print("Exact norm = "+ str(np.linalg.norm(constrained_exact_grad)))
            cur_reg -= np.sign(constrained_grad) * meta_alpha #TODO: signs of gradient...
            #TODO: momentum
        return cur_reg
Example #32
 def hyperloss(transform_vect, i_hyper):
     RS = RandomState((seed, i_hyper, "hyperloss"))
     cur_train_data, cur_valid_data = random_partition(
         train_data, RS, [10, 2])
     z_vect_final = train_z(cur_train_data, transform_vect, RS)
     w_vect_final = transform_weights(z_vect_final, transform_vect)
     return likelihood_loss(w_vect_final, cur_valid_data) / N_scripts
Example #33
def plot():
    import matplotlib.pyplot as plt
    import matplotlib as mpl
    mpl.rcParams['font.family'] = 'serif'
    mpl.rcParams['image.interpolation'] = 'none'
    with open('results.pkl') as f:
        transform_parser, transform_vects, train_losses, tests_losses = pickle.load(
            f)

    RS = RandomState((seed, "plotting"))
    fig = plt.figure(0)
    fig.clf()
    ax = fig.add_subplot(111)
    alphabets = omniglot.load_rotated_alphabets(RS, normalize=False, angle=90)
    num_cols = 15
    num_rows = 5
    omniglot.show_alphabets(alphabets, ax=ax, n_cols=num_cols)
    ax.plot([0, num_cols * 28], [num_rows * 28, num_rows * 28], '--k')
    #ax.text(-15, 5 * 28 * 3 / 2 - 60, "Rotated alphabets", rotation='vertical')
    plt.savefig("all_alphabets.png", bbox_inches='tight')

    # Plotting transformations
    names = ['no_sharing', 'full_sharing', 'learned_sharing']
    title_strings = {
        'no_sharing': 'Independent nets',
        'full_sharing': 'Shared bottom layer',
        'learned_sharing': 'Learned sharing'
    }
    covar_imgs = {
        name: build_covar_image(transform_vects[name])
        for name in names
    }

    for model_ix, model_name in enumerate(names):
        image_list = covar_imgs[model_name]
        for layer_ix, image in enumerate(image_list):
            fig = plt.figure(0)
            fig.clf()
            fig.set_size_inches((1, 1))
            ax = fig.add_subplot(111)
            ax.matshow(image, cmap=mpl.cm.binary, vmin=0.0, vmax=1.0)
            ax.set_xticks([])
            ax.set_yticks([])
            plt.savefig('minifigs/learned_corr_{0}_{1}.png'.format(
                model_name, layer_ix),
                        bbox_inches='tight')
            plt.savefig('minifigs/learned_corr_{0}_{1}.pdf'.format(
                model_name, layer_ix),
                        bbox_inches='tight')

    # Write results to a nice latex table for paper.
    with open('results_table.tex', 'w') as f:
        f.write(" & No Sharing & Full Sharing & Learned \\\\\n")
        f.write("Training loss & {:2.2f} & {:2.2f} & {:2.2f} \\\\\n".format(
            train_losses['no_sharing'], train_losses['full_sharing'],
            train_losses['learned_sharing']))
        f.write("Test loss & {:2.2f} & {:2.2f} & \\bf {:2.2f} ".format(
            tests_losses['no_sharing'], tests_losses['full_sharing'],
            tests_losses['learned_sharing']))
Example #34
    def hyperloss(hyperparam_vect, i_hyper, alphabets, verbose=True, report_train_loss=False):
        RS = RandomState((seed, i_hyper, "hyperloss"))        
        alphabet = shuffle_alphabet(RS.choice(alphabets), RS)
        N_train = alphabet['X'].shape[0] - N_valid_dpts
        train_data = dictslice(alphabet, slice(None, N_train))
        if report_train_loss:
            valid_data = dictslice(alphabet, slice(None, N_valid_dpts))
        else:
            valid_data = dictslice(alphabet, slice(N_train, None))
        def primal_loss(W, hyperparam_vect, i_primal, reg_penalty=True):
            RS = RandomState((seed, i_hyper, i_primal))
            idxs = RS.permutation(N_train)[:batch_size]
            minibatch = dictslice(train_data, idxs)
            loss = reg_loss_fun(W, minibatch, hyperparam_vect, reg_penalty)
            if verbose and i_primal % 10 == 0: print "Iter {0}, loss, {1}".format(i_primal, getval(loss))
            return loss

        W0 = RS.randn(N_weights) * initialization_scale
        W_final = sgd(grad(primal_loss), hyperparam_vect, W0, alpha, beta, N_iters, callback=None)
        return reg_loss_fun(W_final, valid_data, hyperparam_vect, reg_penalty=False)
Example #35
 def primal_loss(z_vect, transform_vect, i_primal, record_results):
     RS = RandomState((seed, i_hyper, i_primal, i_script))
     w_vect = transform_weights(z_vect, transform_vect)
     loss = total_loss(w_vect, train_data)
     reg = regularization(z_vect)
     if VERBOSE and record_results and i_primal % N_thin == 0:
         print "Iter {0}: train: {1}, valid: {2}, reg: {3}".format(
             i_primal,
             getval(loss) / N_scripts,
             total_loss(getval(w_vect), valid_data) / N_scripts,
             getval(reg))
     return loss + reg
Example #36
    def hyperloss(transform_vect, i_hyper, record_results=True):
        RS = RandomState((seed, i_hyper, "hyperloss"))

        def primal_loss(z_vect, transform_vect, i_primal, record_results):
            RS = RandomState((seed, i_hyper, i_primal, i_script))
            w_vect = transform_weights(z_vect, transform_vect)
            loss = total_loss(w_vect, train_data)
            reg = regularization(z_vect)
            if VERBOSE and record_results and i_primal % N_thin == 0:
                print "Iter {0}: train: {1}, valid: {2}, reg: {3}".format(
                    i_primal, getval(loss) / N_scripts, total_loss(getval(w_vect), valid_data) / N_scripts, getval(reg)
                )
            return loss + reg

        z_vect_0 = RS.randn(script_parser.vect.size) * np.exp(log_initialization_scale)
        z_vect_final = sgd(grad(primal_loss), transform_vect, z_vect_0, alpha, beta, N_iters, callback=None)
        w_vect_final = transform_weights(z_vect_final, transform_vect)
        valid_loss = total_loss(w_vect_final, valid_data)
        if record_results:
            results["valid_loss"].append(getval(valid_loss) / N_scripts)
            results["train_loss"].append(total_loss(w_vect_final, train_data) / N_scripts)
        return valid_loss
Example #37
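# Build a single image grid with one row per alphabet and n_cols randomly sampled
# characters per row, then display and save it.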
def show_alphabets(alphabets, ax=None, n_cols=20):
    import matplotlib as mpl
    import matplotlib.pyplot as plt
    from nn_utils import plot_images
    seed = 1
    n_rows = len(alphabets)
    full_image = np.zeros((0, n_cols * 28))
    for alphabet in alphabets:
        RS = RandomState(seed)
        char_idxs = RS.randint(alphabet['X'].shape[0], size=n_cols)
        char_ids = np.argmax(alphabet['T'][char_idxs], axis=1)
        image = alphabet['X'][char_idxs].reshape((n_cols, 28, 28))
        image = np.transpose(image, axes=[1, 0, 2]).reshape((28, n_cols * 28))
        full_image = np.concatenate((full_image, image))
        
    if ax is None:
        fig = plt.figure()
        fig.set_size_inches((8, 8 * n_rows/n_cols))
        ax = fig.add_subplot(111)
    ax.imshow(full_image, cmap=mpl.cm.binary)
    ax.set_xticks(np.array([]))
    ax.set_yticks(np.array([]))
    plt.tight_layout()
    plt.savefig("all_alphabets.png")
Example #39
    def hyperloss(transform_vect, i_hyper, record_results=True):
        RS = RandomState((seed, i_hyper, "hyperloss"))
        def primal_loss(z_vect, transform_vect, i_primal, record_results=False):
            w_vect = transform_weights(z_vect, transform_vect)
            loss = total_loss(w_vect, train_data)
            reg = regularization(z_vect)
            if VERBOSE and record_results and i_primal % N_thin == 0:
                print "Iter {0}: train: {1}, valid: {2}, reg: {3}".format(
                    i_primal,
                    getval(loss) / N_scripts,
                    total_loss(getval(w_vect), valid_data) / N_scripts,
                    getval(reg))
            return loss + reg

        z_vect_0 = RS.randn(script_parser.vect.size) * np.exp(log_initialization_scale)
        z_vect_final = sgd(grad(primal_loss), transform_vect, z_vect_0,
                           alpha, beta, N_iters, callback=None)
        w_vect_final = transform_weights(z_vect_final, transform_vect)
        valid_loss = total_loss(w_vect_final, valid_data)
        if record_results:
            results['valid_loss'].append(getval(valid_loss) / N_scripts) 
            results['train_loss'].append(total_loss(w_vect_final, train_data) / N_scripts)
            results['tests_loss'].append(total_loss(w_vect_final, tests_data) / N_scripts)
        return valid_loss
Example #40
def plot():
    import matplotlib.pyplot as plt
    import matplotlib as mpl
    mpl.rcParams['font.family'] = 'serif'

    with open('results.pkl') as f:
        transform_parser, transform_vects, train_losses, tests_losses = pickle.load(f)
    RS = RandomState((seed, "top_rs"))
    omniglot.show_alphabets(omniglot.load_flipped_alphabets(RS, normalize=False))

    # Plotting transformations
    names = ['no_sharing', 'full_sharing', 'learned_sharing']
    title_strings = {'no_sharing'      : 'Independent\nnets',
                     'full_sharing'    : 'Shared\nbottom layer',
                     'learned_sharing' : 'Learned\nsharing'}
    covar_imgs = {name : build_covar_image(transform_vects[name]) for name in names}

    prop={'family':'serif', 'size':'12'}

    fig = plt.figure(0)
    fig.clf()
    fig.set_size_inches((4,4))
    for i, name in enumerate(names):
        ax = fig.add_subplot(1, 3, i + 1)
        ax.matshow(covar_imgs[name], cmap = mpl.cm.binary)
        ax.set_title(title_strings[name])
        ax.set_xticks([])
        ax.set_yticks([])
        if i == 0:
            labels = ["Layer {0}".format(layer) for layer in [3, 2, 1]]            
            ypos   = [5, 15, 25]
            for s, y in zip(labels, ypos):
                ax.text(-3, y, s, rotation='vertical')
    plt.tight_layout()
    plt.savefig('learned_corr.png')
    plt.savefig('learned_corr.pdf')
Example #41
def run():
    RS = RandomState((seed, "top_rs"))
    all_data = mnist.load_data_as_dict()
    train_data, tests_data = random_partition(all_data, RS, [N_train, N_tests])
    w_parser, pred_fun, loss_fun, frac_err = make_nn_funs(layer_sizes)
    N_weights = w_parser.vect.size

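    # The hyperparameter "transform" is a per-weight log-scale: the trained latent
    # weights z are mapped to network weights by w = z * exp(transform), while the
    # L2 penalty is applied to z directly.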
    def transform_weights(z_vect, transform):
        return z_vect * np.exp(transform)

    def regularization(z_vect):
        return np.dot(z_vect, z_vect) * np.exp(log_L2)

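    # Restrict the transform (and hence its gradient) to the chosen sharing pattern:
    # biases are left untransformed, and weight scales are tied globally ('universal'),
    # per layer ('layers'), or per row of each weight matrix ('units').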
    def constrain_transform(t_vect, name):
        all_t = w_parser.new_vect(t_vect)
        for i in range(N_layers):
            all_t[('biases', i)] = 0.0
        if name == 'universal':
            t_mean = np.mean([np.mean(all_t[('weights', i)])
                              for i in range(N_layers)])
            for i in range(N_layers):
                all_t[('weights', i)] = t_mean
        elif name == 'layers':
            for i in range(N_layers):
                all_t[('weights', i)] = np.mean(all_t[('weights', i)])
        elif name == 'units':
            for i in range(N_layers):
                all_t[('weights', i)] = np.mean(all_t[('weights', i)], axis=1, keepdims=True)
        else:
            raise Exception
        return all_t.vect

    def process_transform(t_vect):
        # Remove the redundancy due to sharing transformations within units
        all_t = w_parser.new_vect(t_vect)
        new_t = np.zeros((0,))
        for i in range(N_layers):
            layer = all_t[('weights', i)]
            assert np.all(layer[:, 0] == layer[:, 1])
            cur_t = log_L2 - 2 * layer[:, 0]
            new_t = np.concatenate((new_t, cur_t))
        return new_t

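    # Inner training loop: SGD on the minibatch loss plus L2 penalty, with the
    # transform held fixed.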
    def train_z(data, z_vect_0, transform):
        N_data = data['X'].shape[0]
        def primal_loss(z_vect, transform, i_primal, record_results=False):
            RS = RandomState((seed, i_primal, "primal"))
            idxs = RS.randint(N_data, size=batch_size)
            minibatch = dictslice(data, idxs)
            w_vect = transform_weights(z_vect, transform)
            loss = loss_fun(w_vect, **minibatch)
            reg = regularization(z_vect)
            if record_results and i_primal % N_thin == 0:
                print "Iter {0}: train: {1}".format(i_primal, getval(loss))
            return loss + reg
        return sgd(grad(primal_loss), transform, z_vect_0, alpha, beta, N_iters)

    all_transforms, all_tests_loss = [], []
    def train_reg(transform_0, constraint, N_meta_iter, i_top):
        def hyperloss(transform, i_hyper, cur_train_data, cur_valid_data):
            RS = RandomState((seed, i_top, i_hyper, "hyperloss"))
            z_vect_0 = RS.randn(N_weights) * np.exp(log_init_scale)
            z_vect_final = train_z(cur_train_data, z_vect_0, transform)
            w_vect_final = transform_weights(z_vect_final, transform)
            return loss_fun(w_vect_final, **cur_valid_data)
        hypergrad = grad(hyperloss)
        cur_transform = transform_0
        for i_hyper in range(N_meta_iter):
            if i_hyper % N_meta_thin == 0:
                tests_loss = hyperloss(cur_transform, i_hyper, train_data, tests_data)
                all_tests_loss.append(tests_loss)
                all_transforms.append(cur_transform.copy())
                print "Hyper iter {0}, test loss {1}".format(i_hyper, all_tests_loss[-1])
            RS = RandomState((seed, i_top, i_hyper, "hyperloss"))
            cur_split = random_partition(train_data, RS, [N_train - N_valid, N_valid])
            raw_grad = hypergrad(cur_transform, i_hyper, *cur_split)
            constrained_grad = constrain_transform(raw_grad, constraint)
            cur_transform -= constrained_grad * meta_alpha
        return cur_transform

    transform = np.zeros(N_weights)
    constraints = ['universal', 'layers', 'units']
    for i_top, (N_meta_iter, constraint) in enumerate(zip(all_N_meta_iter, constraints)):
        print "Top level iter {0}".format(i_top)
        transform = train_reg(transform, constraint, N_meta_iter, i_top)

    all_L2_regs = np.array(zip(*map(process_transform, all_transforms)))
    return all_L2_regs, all_tests_loss
 def indexed_loss_fun(w, L2_vect, i_iter):
     rs = RandomState((seed, i_hyper, i_iter))  # Deterministic seed needed for backwards pass.
     idxs = rs.randint(N_train, size=batch_size)
     return loss_fun(w, train_data['X'][idxs], train_data['T'][idxs], L2_vect)
Example #43
 def hyperloss(transform, i_hyper, cur_train_data, cur_valid_data):
     RS = RandomState((seed, i_top, i_hyper, "hyperloss"))
     z_vect_0 = RS.randn(N_weights) * np.exp(log_init_scale)
     z_vect_final = train_z(cur_train_data, z_vect_0, transform)
     w_vect_final = transform_weights(z_vect_final, transform)
     return loss_fun(w_vect_final, **cur_valid_data)
Example #44
def run():
    train_data, valid_data, tests_data = load_data_dicts(N_train, N_valid, N_tests)
    parser, pred_fun, loss_fun, frac_err = make_nn_funs(layer_sizes)
    N_weight_types = len(parser.names)
    N_weights = len(parser.vect)
    hyperparams = VectorParser()
    rs = RandomState((seed))
    hyperparams['log_L2_reg'] = np.full(N_weights, init_log_L2_reg)\
                              + rs.randn(N_weights) * init_log_L2_reg_noise
    hyperparams['log_param_scale'] = np.full(N_weight_types, init_log_param_scale)
    hyperparams['log_alphas']      = np.full((N_iters, N_weight_types), init_log_alphas)
    hyperparams['invlogit_betas']  = np.full((N_iters, N_weight_types), init_invlogit_betas)

    cur_primal_results = {}

    def primal_optimizer(hyperparam_vect, i_hyper):
        def indexed_loss_fun(w, L2_vect, i_iter):
            rs = RandomState((seed, i_hyper, i_iter))  # Deterministic seed needed for backwards pass.
            idxs = rs.randint(N_train, size=batch_size)
            return loss_fun(w, train_data['X'][idxs], train_data['T'][idxs], L2_vect)

        learning_curve_dict = defaultdict(list)
        def callback(x, v, g, i_iter):
            if i_iter % thin == 0:
                learning_curve_dict['learning_curve'].append(loss_fun(x, **train_data))
                learning_curve_dict['grad_norm'].append(np.linalg.norm(g))
                learning_curve_dict['weight_norm'].append(np.linalg.norm(x))
                learning_curve_dict['velocity_norm'].append(np.linalg.norm(v))

        cur_hyperparams = hyperparams.new_vect(hyperparam_vect)
        rs = RandomState((seed, i_hyper))
        W0 = fill_parser(parser, np.exp(cur_hyperparams['log_param_scale']))
        W0 *= rs.randn(W0.size)
        alphas = np.exp(cur_hyperparams['log_alphas'])
        betas  = logit(cur_hyperparams['invlogit_betas'])
        L2_reg = np.exp(cur_hyperparams['log_L2_reg'])
        W_opt = sgd_parsed(grad(indexed_loss_fun), kylist(W0, alphas, betas, L2_reg),
                           parser, callback=callback)
        cur_primal_results['weights'] = getval(W_opt).copy()
        cur_primal_results['learning_curve'] = getval(learning_curve_dict)
        return W_opt, learning_curve_dict

    def hyperloss(hyperparam_vect, i_hyper):
        W_opt, _ = primal_optimizer(hyperparam_vect, i_hyper)
        return loss_fun(W_opt, **valid_data)
    hyperloss_grad = grad(hyperloss)

    meta_results = defaultdict(list)
    old_metagrad = [np.ones(hyperparams.vect.size)]
    def meta_callback(hyperparam_vect, i_hyper, metagrad=None):
        #x, learning_curve_dict = primal_optimizer(hyperparam_vect, i_hyper)
        x, learning_curve_dict = cur_primal_results['weights'], cur_primal_results['learning_curve']
        cur_hyperparams = hyperparams.new_vect(hyperparam_vect.copy())
        for field in cur_hyperparams.names:
            meta_results[field] = cur_hyperparams[field]
        meta_results['train_loss'].append(loss_fun(x, **train_data))
        meta_results['valid_loss'].append(loss_fun(x, **valid_data))
        meta_results['tests_loss'].append(loss_fun(x, **tests_data))
        meta_results['test_err'].append(frac_err(x, **tests_data))
        meta_results['learning_curves'].append(learning_curve_dict)
        meta_results['example_weights'] = x
        if metagrad is not None:
            meta_results['meta_grad_magnitude'].append(np.linalg.norm(metagrad))
            meta_results['meta_grad_angle'].append(np.dot(old_metagrad[0], metagrad) \
                                                   / (np.linalg.norm(metagrad)*
                                                      np.linalg.norm(old_metagrad[0])))
        old_metagrad[0] = metagrad
        print "Meta Epoch {0} Train loss {1:2.4f} Valid Loss {2:2.4f}" \
              " Test Loss {3:2.4f} Test Err {4:2.4f}".format(
            i_hyper, meta_results['train_loss'][-1], meta_results['valid_loss'][-1],
            meta_results['tests_loss'][-1], meta_results['test_err'][-1])

    initial_hypergrad = hyperloss_grad(hyperparams.vect, 0)
    parsed_init_hypergrad = hyperparams.new_vect(initial_hypergrad.copy())
    final_result = adam(hyperloss_grad, hyperparams.vect, meta_callback, N_meta_iter, meta_alpha)
    meta_callback(final_result, N_meta_iter)
    parser.vect = None # No need to pickle zeros
    return meta_results, parser, parsed_init_hypergrad
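The structure of run() above — an inner training run wrapped in hyperloss, differentiated by grad, and fed to adam — is easier to see in miniature. The sketch below reproduces the same meta-loop for a single hyperparameter (log L2 strength), substituting a closed-form ridge solution for sgd_parsed and a finite-difference hypergradient for reverse-mode differentiation; all names and values here are illustrative, not part of the original experiment.

# Toy meta-optimization sketch (assumed setup, not the original experiment).
import numpy as np

rng = np.random.RandomState(0)
X_tr, X_va = rng.randn(40, 5), rng.randn(20, 5)
w_true = rng.randn(5)
y_tr = np.dot(X_tr, w_true) + 0.5 * rng.randn(40)   # noisy training targets
y_va = np.dot(X_va, w_true)

def train(log_l2):
    # Inner problem: ridge regression, solved in closed form.
    lam = np.exp(log_l2)
    A = np.dot(X_tr.T, X_tr) + lam * np.eye(5)
    return np.linalg.solve(A, np.dot(X_tr.T, y_tr))

def hyperloss(log_l2):
    # Validation loss of the weights produced by the inner training run.
    w = train(log_l2)
    return np.mean((np.dot(X_va, w) - y_va) ** 2)

log_l2, eps, meta_alpha = 0.0, 1e-4, 0.5
for i_hyper in range(30):
    hypergrad = (hyperloss(log_l2 + eps) - hyperloss(log_l2 - eps)) / (2 * eps)
    log_l2 -= meta_alpha * hypergrad
print("meta-optimized log_L2_reg: {0:.3f}".format(log_l2))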
Beispiel #45
0
def run():
    train_data, valid_data, tests_data = load_data_dicts(N_train, N_valid, N_tests)
    parser, pred_fun, loss_fun, frac_err = make_nn_funs(layer_sizes)
    N_weight_types = len(parser.names)

    rs = RandomState((seed))
    init_fake_data = rs.randn(*(train_data['X'].shape)) * init_fake_data_scale
    one_hot = lambda x, K : np.array(x[:,None] == np.arange(K)[None, :], dtype=int)
    fake_labels = one_hot(np.array(range(N_train)) % N_classes, N_classes)  # One of each.

    hyperparams = VectorParser()
    hyperparams['fake_data']  = init_fake_data
    fixed_hyperparams = VectorParser()
    fixed_hyperparams['log_param_scale'] = np.full(N_weight_types, init_log_param_scale)
    fixed_hyperparams['log_alphas']      = np.full((N_iters, N_weight_types), init_log_alphas)
    fixed_hyperparams['invlogit_betas']  = np.full((N_iters, N_weight_types), init_invlogit_betas)
    fixed_hyperparams['log_L2_reg'] = np.full(N_weight_types, init_log_L2_reg)

    cur_primal_results = {}

    def primal_optimizer(hyperparam_vect, i_hyper):
        def indexed_loss_fun(w, meta_vect, i_iter):
            (train_data, train_labels, L2_vect) = meta
            return loss_fun(w, train_data, train_labels, L2_vect)
            #return loss_fun(w, train_data['X'], train_data['T'], L2_vect + np.sum(fake_data.ravel()))

        learning_curve_dict = defaultdict(list)
        def callback(x, v, g, i_iter):
            if i_iter % thin == 0:
        #        learning_curve_dict['learning_curve'].append(loss_fun(x, getval(cur_hyperparams['fake_data']), fake_labels))
                learning_curve_dict['grad_norm'].append(np.linalg.norm(g))
                learning_curve_dict['weight_norm'].append(np.linalg.norm(x))
                learning_curve_dict['velocity_norm'].append(np.linalg.norm(v))


        cur_hyperparams = hyperparams.new_vect(hyperparam_vect)
        fake_data = cur_hyperparams['fake_data']
        rs = RandomState((seed, i_hyper))
        W0 = fill_parser(parser, np.exp(fixed_hyperparams['log_param_scale']))
        W0 *= rs.randn(W0.size)
        alphas = np.exp(fixed_hyperparams['log_alphas'])
        betas  = logit(fixed_hyperparams['invlogit_betas'])
        L2_reg = fill_parser(parser, np.exp(fixed_hyperparams['log_L2_reg']))
        meta = kylist(fake_data, fake_labels, L2_reg)
        W_opt = sgd_parsed(grad(indexed_loss_fun), kylist(W0, alphas, betas, meta),
                           parser, callback=callback)
        cur_primal_results['weights'] = getval(W_opt).copy()
        cur_primal_results['learning_curve'] = getval(learning_curve_dict)
        return W_opt, learning_curve_dict

    def hyperloss(hyperparam_vect, i_hyper):
        W_opt, _ = primal_optimizer(hyperparam_vect, i_hyper)
        return loss_fun(W_opt, **valid_data)
    hyperloss_grad = grad(hyperloss)

    meta_results = defaultdict(list)
    old_metagrad = [np.ones(hyperparams.vect.size)]
    def meta_callback(hyperparam_vect, i_hyper, metagrad=None):
        x, learning_curve_dict = cur_primal_results['weights'], cur_primal_results['learning_curve']
        cur_hyperparams = hyperparams.new_vect(hyperparam_vect.copy())
        for field in cur_hyperparams.names:
            meta_results[field].append(cur_hyperparams[field])
        #meta_results['train_loss'].append(loss_fun(x, getval(cur_hyperparams['fake_data']), fake_labels))
        meta_results['train_loss'].append(0)
        meta_results['valid_loss'].append(loss_fun(x, **valid_data))
        meta_results['tests_loss'].append(loss_fun(x, **tests_data))
        meta_results['test_err'].append(frac_err(x, **tests_data))
        meta_results['learning_curves'].append(learning_curve_dict)
        meta_results['example_weights'] = x
        if metagrad is not None:
            print(metagrad)
            meta_results['meta_grad_magnitude'].append(np.linalg.norm(metagrad))
            meta_results['meta_grad_angle'].append(np.dot(old_metagrad[0], metagrad) \
                                                   / (np.linalg.norm(metagrad)*
                                                      np.linalg.norm(old_metagrad[0])))
        old_metagrad[0] = metagrad
        print "Meta Epoch {0} Train loss {1:2.4f} Valid Loss {2:2.4f}" \
              " Test Loss {3:2.4f} Test Err {4:2.4f}".format(
            i_hyper, meta_results['train_loss'][-1], meta_results['valid_loss'][-1],
            meta_results['tests_loss'][-1], meta_results['test_err'][-1])

    final_result = adam(hyperloss_grad, hyperparams.vect, meta_callback, N_meta_iter, meta_alpha)
    meta_callback(final_result, N_meta_iter)
    parser.vect = None # No need to pickle zeros
    return meta_results, parser
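Both this example and the previous one call fill_parser(parser, np.exp(...)) to broadcast one scalar per named weight group (e.g. a per-layer init scale or L2 strength) into a full vector the size of parser.vect. A hedged sketch of that assumed behaviour follows; fill_parser_sketch and FakeParser are stand-ins, and the assumption that idxs_and_shapes maps each group name to an (indices, shape) pair is inferred, not taken from the library:

# Hedged sketch of the assumed fill_parser behaviour (not the library code).
import numpy as np

def fill_parser_sketch(parser, scalar_per_group):
    """Broadcast one scalar per named weight group into a flat, weight-sized vector."""
    chunks = []
    for name, scalar in zip(parser.names, scalar_per_group):
        _, shape = parser.idxs_and_shapes[name]
        chunks.append(np.full(int(np.prod(shape)), scalar))
    return np.concatenate(chunks)

class FakeParser(object):
    # Minimal stand-in exposing the two attributes the sketch relies on.
    names = [('weights', 0), ('biases', 0)]
    idxs_and_shapes = {('weights', 0): (slice(0, 6), (2, 3)),
                       ('biases', 0):  (slice(6, 9), (3,))}

print(fill_parser_sketch(FakeParser(), [0.1, 0.2]))
# [0.1 0.1 0.1 0.1 0.1 0.1 0.2 0.2 0.2]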
 def new_hyperloss(reg, i_hyper, cur_train_data, cur_valid_data):
     RS = RandomState((seed, i_hyper, "hyperloss"))
     w_vect_0 = RS.randn(N_weights) * init_scales
     w_vect_final = train_z(cur_train_data, w_vect_0, reg)
     return loss_fun(w_vect_final, **cur_valid_data)
Beispiel #47
0
 def loss_fun(alphabets, report_train_loss):
     RS = RandomState((seed, "evaluation")) # Same alphabet with i_hyper now
     return np.mean([hyperloss(hyperparam_vect, RS.int32(), alphabets=alphabets,
                               verbose=False, report_train_loss=report_train_loss)
                     for i in range(N_alphabets_eval)])
Beispiel #48
0
def run(script_corr):
    """Three different parsers:
    w_parser[('biases', i_layer)] : neural net weights/biases per layer for a single script
    script_parser[i_script]       : weights vector for each script
    transform_parser[i_layer]     : transform matrix (scripts x scripts) for each alphabet"""
    RS = RandomState((seed, "top_rs"))
    train_data, valid_data, tests_data = omniglot.load_data_split(
        [11, 2, 2], RS, num_alphabets=N_scripts)
    w_parser, pred_fun, loss_fun, frac_err = make_nn_funs(layer_sizes)
    N_weights = w_parser.vect.size

    uncorrelated_mat = np.eye(N_scripts)
    fully_correlated_mat = np.full((N_scripts, N_scripts), 1.0 / N_scripts)
    transform_mat = ((1 - script_corr) * uncorrelated_mat
                     + script_corr * fully_correlated_mat)
    transform_parser = VectorParser()
    for i_layer in range(N_layers):
        if i_layer == N_layers - 1:
            transform_parser[i_layer] = uncorrelated_mat
        else:
            transform_parser[i_layer] = transform_mat

    script_parser = VectorParser()
    for i_script in range(N_scripts):
        script_parser[i_script] = np.zeros(N_weights)

    def transform_weights(all_z_vect, transform_vect, i_script_out):
        all_z = script_parser.new_vect(all_z_vect)
        transform = transform_parser.new_vect(transform_vect)
        # Can't use the parser here: setting plain array ranges with funkyyak nodes is not yet supported.
        W = OrderedDict()
        for k in w_parser.idxs_and_shapes.keys():
            W[k] = 0.0
        for i_layer in range(N_layers):
            script_weightings = transform[i_layer][i_script_out, :]
            for i_script in range(N_scripts):
                z_i_script = w_parser.new_vect(all_z[i_script])
                script_weighting = script_weightings[i_script]
                W[('biases', i_layer)] += z_i_script[('biases', i_layer)] * script_weighting
                W[('weights', i_layer)] += z_i_script[('weights', i_layer)] * script_weighting
        return np.concatenate([v.ravel() for v in W.values()])

    def loss_from_latents(z_vect, transform_vect, i_script, data):
        w_vect = transform_weights(z_vect, transform_vect, i_script)
        return loss_fun(w_vect, **data)

    def regularization(z_vect):
        return np.dot(z_vect, z_vect) * np.exp(log_L2_init)

    results = defaultdict(list)

    def hyperloss(transform_vect, i_hyper, record_results=False):
        def primal_stochastic_loss(z_vect, transform_vect, i_primal):
            RS = RandomState((seed, i_hyper, i_primal))
            loss = 0.0
            for _ in range(N_scripts_per_iter):
                i_script = RS.randint(N_scripts)
                N_train = train_data[i_script]['X'].shape[0]
                idxs = RS.permutation(N_train)[:batch_size]
                minibatch = dictslice(train_data[i_script], idxs)
                loss += loss_from_latents(z_vect, transform_vect, i_script,
                                          minibatch)
            reg = regularization(z_vect)
            if i_primal % 20 == 0:
                print "Iter {0}, loss {1}, reg {2}".format(
                    i_primal, getval(loss), getval(reg))
                print "Full losses: train: {0}, valid: {1}".format(
                    total_loss(train_data, getval(z_vect)),
                    total_loss(valid_data, getval(z_vect)))
            return loss + reg

        def total_loss(data, z_vect):
            return np.mean([
                loss_from_latents(z_vect, transform_vect, i_script,
                                  data[i_script])
                for i_script in range(N_scripts)
            ])

        z_vect_0 = RS.randn(script_parser.vect.size) * np.exp(log_initialization_scale)
        z_vect_final = sgd(grad(primal_stochastic_loss),
                           transform_vect,
                           z_vect_0,
                           alpha,
                           beta,
                           N_iters,
                           callback=None)
        valid_loss = total_loss(valid_data, z_vect_final)
        if record_results:
            results['valid_loss'].append(valid_loss)
            results['train_loss'].append(total_loss(train_data, z_vect_final))
            # results['tests_loss'].append(total_loss(tests_data, z_vect_final))
        return valid_loss

    hyperloss(transform_parser.vect, 0, record_results=True)
    return results['train_loss'][-1], results['valid_loss'][-1]
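The transform matrices built near the top of run() interpolate between an identity matrix (no sharing across scripts) and a uniform matrix (full sharing). A quick numeric check of that interpolation with three scripts and script_corr = 0.5 (toy values, only to make the mixing weights concrete; output rounded):

import numpy as np

N_scripts, script_corr = 3, 0.5          # toy values for illustration
uncorrelated_mat = np.eye(N_scripts)
fully_correlated_mat = np.full((N_scripts, N_scripts), 1.0 / N_scripts)
transform_mat = (1 - script_corr) * uncorrelated_mat + script_corr * fully_correlated_mat
print(transform_mat)
# [[0.667 0.167 0.167]
#  [0.167 0.667 0.167]
#  [0.167 0.167 0.667]]
# Each row keeps 2/3 of the weight on the script's own latent vector and spreads the rest evenly.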
 def hyperloss(reg, i_hyper, cur_train_data, cur_valid_data):
     RS = RandomState((seed, i_top, i_hyper, "hyperloss"))
     w_vect_0 = RS.randn(N_weights) * init_scales
     w_vect_final = train_z(loss_fun, cur_train_data, w_vect_0, reg)
     # fraction_error = frac_err(w_vect_final,**cur_valid_data)
     return loss_fun(w_vect_final, **cur_valid_data)