Ejemplo n.º 1
0
def _run_experiment_args(self, results_file, data_and_splits, method_results, i_labels, split):
    """Train and evaluate one (num_labels, split) experiment, reusing cached results.

    If `_load_temp_split_file` returns a truthy value for this pair it is
    returned immediately. Otherwise a deep copy of `self.configs.learner`
    is trained and tested on `data_and_splits.get_split(split, num_labels)`;
    the group master saves the result to the temp split file, removes the
    per-split temp directory root, and prints a one-line summary.

    `method_results` is accepted but not used in this function.

    Returns the results object produced by `train_and_test` (or the cached one).
    """
    num_labels = self.configs.num_labels[i_labels]
    split_tag = '-'.join([str(num_labels), str(split)])

    cached = _load_temp_split_file(results_file, num_labels, split)
    if cached:
        return cached

    # Timing is only reported by the master process.
    if mpi_utility.is_master():
        timer.tic()

    # Scratch directory for the learner, derived from the temp file name.
    temp_file_name = _temp_split_file_name(results_file, num_labels, split)
    temp_dir_root = helper_functions.remove_suffix(temp_file_name, '.pkl')
    temp_dir = temp_dir_root + '/CV-temp/'

    curr_data = data_and_splits.get_split(split, num_labels)
    learner = self.configs.learner
    # Work on a copy so per-split attributes do not leak into the shared learner.
    curr_learner = copy.deepcopy(learner)
    curr_learner.split_idx_str = split_tag
    curr_learner.temp_dir = temp_dir
    curr_results = curr_learner.train_and_test(curr_data)

    if mpi_utility.is_group_master():
        helper_functions.save_object(
            _temp_split_file_name(results_file, num_labels, split),
            curr_results)
        helper_functions.delete_dir_if_exists(temp_dir_root)

    instance_subset = learner.configs.instance_subset
    results_features = learner.configs.results_features
    test_error_to_print = 'is_train'
    if mpi_utility.is_group_master():
        if not hasattr(curr_learner, 'best_params'):
            print(split_tag + ' Done')
        else:
            params_text = str(curr_learner.best_params)
            error_text = str(curr_results.compute_error(
                self.configs.loss_function, results_features, test_error_to_print))
            print(split_tag + '-' + params_text + ' Error: ' + error_text)

    if mpi_utility.is_master():
        timer.toc()
    return curr_results
Ejemplo n.º 2
0
def mult_test(*args):
    """Time a dense 2000x2000 Gram-matrix product (X^T X) on uniform random data.

    Positional arguments are accepted and ignored.
    """
    print('mult')
    shape = (2000, 2000)
    matrix = np.random.uniform(-1, 1, shape)
    # Only the matrix product is timed; the random draw is excluded.
    tic()
    gram = np.dot(matrix.T, matrix)
    toc()
Ejemplo n.º 3
0
def nystrom_woodbury_laplacian(X,
                               lamb,
                               perc_columns,
                               W=None,
                               C=None,
                               D=None,
                               v=None):
    """Approximate the inverse of ``lamb * I + diag(X.sum(1)) - X``.

    The approximation combines Nystrom factors ``W`` and ``C`` with a
    Woodbury-style update around the diagonal ``diag(X.sum(1) + lamb)``.

    Args:
        X: square affinity matrix; only its row sums are used unless the
            Nystrom factors have to be computed.
        lamb: regularisation added to the diagonal (converted to float).
        perc_columns: forwarded to `nystrom` when `W` or `C` is missing.
        W, C: optional precomputed Nystrom factors; both must be given to
            skip the call to `nystrom`.
        D: unused.
        v: optional vector for a matrix-free product. This path is guarded
            by a failing assertion and is not considered validated.

    Returns:
        (inv_approx, vProd): the approximate inverse (or None) and the
        approximate inverse applied to `v` (or None).
    """
    lamb = float(lamb)
    timing_test = False  # debug switch: time and compare to an exact inverse
    if timing_test:
        tic()
    if W is None or C is None:
        W, C = nystrom(X, perc_columns)

    d = X.sum(1)
    diag_inv = 1 / (d + lamb)

    inv_approx = None
    vProd = None
    fast_solver = True  # debug switch: False uses explicit diagonal matrices
    if not fast_solver:
        A_inv = np.diag(1 / (d + lamb))
        CTA = C.T.dot(A_inv)
        B_inv = np.linalg.pinv(-W + CTA.dot(C))
        inv_approx = A_inv - A_inv.dot(C).dot(B_inv).dot(CTA)
    else:
        # Broadcasting scales the columns of C.T instead of forming diag(.).
        CTA = C.T * diag_inv
        B_inv = np.linalg.pinv(-W + CTA.dot(C))
        if v is None:
            correction = -C.dot(B_inv).dot(CTA)
            correction[np.diag_indices_from(correction)] += 1
            inv_approx = diag_inv[:, None] * correction
        else:
            assert False, 'Make sure this works'
            projected = B_inv.dot(CTA.dot(v))
            vProd = diag_inv * (-C.dot(projected) + v)

    if timing_test:
        toc()
        tic()
        inv_actual = np.linalg.inv(lamb * np.eye(X.shape[0]) + np.diag(d) - X)
        rel_err = norm(inv_approx - inv_actual) / norm(inv_actual)
        print('Nystrom-Woodbery error: ' + str(rel_err))
        toc()
    return inv_approx, vProd
Ejemplo n.º 4
0
def mult_test(*args):
    """Benchmark X.T.dot(X) for a 2000x2000 uniform random matrix.

    Any positional arguments are ignored.
    """
    print('mult')
    n_rows, n_cols = 2000, 2000
    data = np.random.uniform(-1, 1, (n_rows, n_cols))
    # The timer brackets the product only.
    tic()
    product = np.dot(data.T, data)
    toc()
Ejemplo n.º 5
0
def normal_test(*args):
    """Time a ridge-regression solve via the normal equations.

    Builds a 5000x2000 uniform random design matrix and target vector, then
    times forming (X^T X + C*I) and X^T y and solving the linear system.
    Positional arguments are ignored.
    """
    n_samples, n_features = 5000, 2000
    design = np.random.uniform(-1, 1, (n_samples, n_features))
    ridge = 1e-3
    targets = np.random.uniform(-1, 1, n_samples)
    tic()
    lhs = np.dot(design.T, design) + ridge * np.eye(n_features)
    rhs = np.dot(design.T, targets)
    weights = np.linalg.solve(lhs, rhs)
    toc()
Ejemplo n.º 6
0
def normal_test(*args):
    """Benchmark solving regularised normal equations on random data.

    The timed section covers assembling X^T X + C*I (p = 2000), assembling
    X^T y (n = 5000) and the call to `np.linalg.solve`. Arguments are ignored.
    """
    n, p = 5000, 2000
    penalty = 1e-3
    features = np.random.uniform(-1, 1, (n, p))
    response = np.random.uniform(-1, 1, n)
    tic()
    gram = np.dot(features.T, features) + penalty * np.eye(p)
    moment = np.dot(features.T, response)
    solution = np.linalg.solve(gram, moment)
    toc()
Ejemplo n.º 7
0
def cvx_test(*args):
    """Time an SCS solve of a ridge-regression problem built with `cvx`.

    The problem minimises sum of squared residuals plus C * ||w||_2^2 on a
    5000x100 uniform random design. Only `prob.solve` is timed; positional
    arguments are ignored.
    """
    n_samples, n_features = 5000, 100
    design = np.random.uniform(-1, 1, (n_samples, n_features))
    ridge = 1e-3
    targets = np.random.uniform(-1, 1, n_samples)

    coef = cvx.Variable(n_features)
    data_term = cvx.sum_entries(cvx.square(design * coef - targets))
    penalty_term = cvx.norm2(coef)**2
    problem = cvx.Problem(cvx.Minimize(data_term + ridge * penalty_term), [])

    tic()
    problem.solve(solver=cvx.SCS, verbose=False)
    toc()
Ejemplo n.º 8
0
def cvx_test(*args):
    """Benchmark the SCS solver on an unconstrained ridge-regression problem.

    Random data: X is 5000x100 and y has 5000 entries, both uniform on
    [-1, 1]. The objective is squared loss plus 1e-3 times the squared
    2-norm of the weights. Arguments are ignored.
    """
    n, p = 5000, 100
    features = np.random.uniform(-1, 1, (n, p))
    reg_weight = 1e-3
    response = np.random.uniform(-1, 1, n)

    weights = cvx.Variable(p)
    squared_loss = cvx.sum_entries(cvx.square(features * weights - response))
    squared_norm = cvx.norm2(weights)**2
    objective = cvx.Minimize(squared_loss + reg_weight * squared_norm)
    problem = cvx.Problem(objective, [])

    # Problem construction is excluded from the timing.
    tic()
    problem.solve(solver=cvx.SCS, verbose=False)
    toc()
Ejemplo n.º 9
0
def run_main(num_labels=None, split_idx=None, no_viz=None, configs=None, comm=None):
    """Parse options, run the configured experiments and optionally visualise.

    Command-line flags `-num_labels`, `-split_idx` and `-no_viz` are read
    from `sys.argv`; any keyword argument that is not None overrides the
    corresponding parsed value. The parsed namespace and `comm` are stored
    on `configs_lib`. When `test_mpi` is set the function only prints the
    MPI rank together with the two numeric options and returns.

    Experiments run when `configs_lib.run_experiments` is truthy. On a
    laptop a beep is played afterwards, and visualisation runs when it is
    not disabled and the MPI world has exactly one process.
    """
    import argparse
    import sys

    parser = argparse.ArgumentParser()
    parser.add_argument('-num_labels', type=int)
    parser.add_argument('-split_idx', type=int)
    parser.add_argument('-no_viz', action='store_true')
    arguments = parser.parse_args(sys.argv[1:])

    # Explicit keyword arguments win over the command line.
    overrides = (
        ('num_labels', num_labels),
        ('split_idx', split_idx),
        ('no_viz', no_viz),
    )
    for option_name, option_value in overrides:
        if option_value is not None:
            setattr(arguments, option_name, option_value)

    configs_lib.comm = comm
    if test_mpi:
        from mpi4py import MPI
        rank_text = str(MPI.COMM_WORLD.Get_rank())
        print(rank_text + '-' + str(arguments.num_labels) + '-' + str(arguments.split_idx))
        return

    configs_lib.arguments = arguments
    import warnings
    warnings.filterwarnings("ignore", category=DeprecationWarning)

    from mpi4py import MPI
    comm = MPI.COMM_WORLD
    if MPI.COMM_WORLD.Get_size() <= 1:
        print('Starting experiments...')
    elif mpi_utility.is_group_master():
        print('(' + socket.gethostname() + ')''Process ' + str(comm.Get_rank()) + ': Starting experiments...')

    # Only group masters report wall-clock time for the experiment run.
    if mpi_utility.is_group_master():
        timer.tic()
    if configs_lib.run_experiments:
        run_experiments(configs)
    if mpi_utility.is_group_master():
        timer.toc()

    if helper_functions.is_laptop():
        # Audible notification that the run has finished.
        import winsound
        winsound.Beep(440, 1000)

    visualize = (helper_functions.is_laptop()
                 and not arguments.no_viz
                 and MPI.COMM_WORLD.Get_size() == 1)
    if not visualize:
        return
    vis_configs = configs_lib.VisualizationConfigs()
    if vis_configs.vis_table:
        create_table()
    else:
        run_visualization()
Ejemplo n.º 10
0
    tic()
    A = X.T.dot(X) + C * np.eye(p)
    k = X.T.dot(y)
    w = np.linalg.solve(A, k)
    toc()


def cvx_test(*args):
    """Time `prob.solve` (SCS) for a randomly generated ridge problem.

    Uses n = 5000 samples and p = 100 features drawn uniformly from
    [-1, 1], with regularisation weight 1e-3 and no constraints.
    Positional arguments are ignored.
    """
    n_obs, n_vars = 5000, 100
    inputs = np.random.uniform(-1, 1, (n_obs, n_vars))
    reg_coef = 1e-3
    outputs = np.random.uniform(-1, 1, n_obs)

    w_var = cvx.Variable(n_vars)
    fit_cost = cvx.sum_entries(cvx.square(inputs * w_var - outputs))
    norm_cost = cvx.norm2(w_var)**2
    problem = cvx.Problem(cvx.Minimize(fit_cost + reg_coef * norm_cost), [])

    tic()
    problem.solve(solver=cvx.SCS, verbose=False)
    toc()


if __name__ == '__main__':
    # Every process runs the test; only world rank 0 starts and stops the
    # timer so the elapsed time is reported once.
    comm = MPI.COMM_WORLD
    is_master = (0 == comm.Get_rank())
    if is_master:
        tic()

    run_test()

    if is_master:
        toc()
Ejemplo n.º 11
0
    def predict(self, data):
        """Predict outputs for `data` by graph-regularised smoothing.

        Outline of what the code below does:

        1. Choose the instances `I` to work on: all of them, or a random
           sample of `self.predict_sample` when that is smaller than the
           number of source predictions.
        2. Build `W_source_pred`, an affinity matrix over
           `data.source_y_pred[I]` (RBF with `self.sigma_tr`, or kNN with
           `k = sigma_tr * I.size`). With RBF and `self.oracle_guidance`
           set, a random subset of entries is overwritten with affinities
           derived from `data.true_y`.
        3. Build `W` between the transformed training inputs `self.x` and
           the transformed `data.x[I]`, and turn it into a smoothing
           matrix `S`.
        4. Optionally sparsify `W_source_pred` elementwise with a radius
           or kNN graph on the transformed inputs.
        5. Solve for `f` either with the Nystrom-Woodbury approximation
           (`self.nystrom_percentage > 0`) or with a least-squares solve of
           `(I + C * L) f = S y`, where `L` is the unnormalised Laplacian
           of `W_source_pred`.
        6. If `self.predict_sample` is not None, fit `self.nw_learner` on
           `(data.x[I], f)` and use its predictions for all of `data`.

        Side effects: sets `self.predict_time` from `toc()` around the
        solve, and retrains `self.nw_learner` in step 6.

        Returns:
            A `results.Output` for `data` with `y` and `fu` both set to the
            predictions.
        """
        y_pred_source = data.source_y_pred
        I = np.arange(y_pred_source.size)
        if self.predict_sample is not None and self.predict_sample < y_pred_source.size:
            I = np.random.choice(y_pred_source.size,
                                 self.predict_sample,
                                 replace=False)
        if self.use_rbf:
            W_source_pred = array_functions.make_rbf(y_pred_source[I],
                                                     self.sigma_tr)
            if self.oracle_guidance is not None:
                y = data.true_y[I]

                n_y = y.size
                # oracle_guidance is used as a fraction of the n_y**2
                # entries; index pairs are drawn with replacement.
                num_to_sample = math.ceil(self.oracle_guidance * n_y**2)
                rand_index1 = np.random.choice(n_y,
                                               int(num_to_sample),
                                               replace=True)
                rand_index2 = np.random.choice(n_y,
                                               int(num_to_sample),
                                               replace=True)
                if self.oracle_guidance_binary:
                    # Binary affinity: 1 when the true targets are within
                    # 20% of the target range of each other.
                    target_distances = array_functions.make_graph_distance(y)
                    distance_threshold = .2 * (y.max() - y.min())
                    W_source_pred[rand_index1, rand_index2] = target_distances[
                        rand_index1, rand_index2] <= distance_threshold
                    W_source_pred[rand_index2, rand_index1] = target_distances[
                        rand_index2, rand_index1] <= distance_threshold
                else:
                    # Rescale the true targets to the range of the source
                    # predictions so the same bandwidth can be reused.
                    y_scaled = array_functions.normalize(y) * (
                        y_pred_source.max() - y_pred_source.min())
                    W_oracle_pred = array_functions.make_rbf(
                        y_scaled, self.sigma_tr)
                    W_source_pred[rand_index1,
                                  rand_index2] = W_oracle_pred[rand_index1,
                                                               rand_index2]
                    W_source_pred[rand_index2,
                                  rand_index1] = W_oracle_pred[rand_index2,
                                                               rand_index1]
            W = array_functions.make_rbf(self.transform.transform(self.x),
                                         self.sigma_nw,
                                         x2=self.transform.transform(
                                             data.x[I, :])).T

        else:
            assert self.oracle_guidance is None
            # In kNN mode the sigma_* values are fractions of the number
            # of points, converted here to neighbour counts.
            k_L = int(self.sigma_tr * I.size)
            W_source_pred = array_functions.make_knn(y_pred_source[I], k_L)
            k_W = int(self.sigma_nw * self.x.shape[0])
            W = array_functions.make_knn(self.transform.transform(
                data.x[I, :]),
                                         k_W,
                                         x2=self.transform.transform(self.x))
        sparsify_prediction_graph = False
        if self.use_prediction_graph_radius:
            sparsify_prediction_graph = True
            W_sparse = array_functions.make_graph_radius(
                self.transform.transform(data.x[I, :]),
                radius=self.radius,
            )
        # If both sparsification options are enabled, the kNN graph below
        # replaces the radius graph.
        if self.use_prediction_graph_sparsification:
            sparsify_prediction_graph = True
            W_sparse = array_functions.make_knn(self.transform.transform(
                data.x[I, :]),
                                                self.k_sparsification,
                                                normalize_entries=False)
        if sparsify_prediction_graph:
            W_source_pred = W_source_pred * W_sparse
        S = array_functions.make_smoothing_matrix(W)
        timing_test = False  # debug switch: run both solvers and compare
        C = self.C * self.x.shape[0] / W_source_pred[:].sum()
        if self.nystrom_percentage > 0 or timing_test:
            if timing_test:
                tic()
            Sy = S.dot(self.y)
            if C != 0:
                lamb = 1 / float(C)
                f = None
                tic()
                inv_approx, _ = array_functions.nystrom_woodbury_laplacian(
                    W_source_pred, lamb, self.nystrom_percentage)
                self.predict_time = toc()
                # f is always None here; the first branch only matters if
                # the matrix-free variant of the solver is re-enabled.
                if f is not None:
                    f *= lamb
                else:
                    inv_approx *= lamb
                    f = inv_approx.dot(Sy)
            else:
                f = Sy
            if timing_test:
                toc()
        if self.nystrom_percentage == 0 or self.nystrom_percentage is None or timing_test:
            if timing_test:
                tic()
            L = array_functions.make_laplacian_with_W(W_source_pred,
                                                      normalized=False)
            A = np.eye(I.size) + C * L
            try:
                tic()
                f = np.linalg.lstsq(A, S.dot(self.y))[0]
                self.predict_time = toc()
            except Exception:
                # Best-effort fallback when the solve fails. Catching
                # Exception (rather than everything) lets KeyboardInterrupt
                # and SystemExit propagate.
                # NOTE(review): the fallback has data.true_y's shape, not
                # I.size -- confirm this is intended when subsampling.
                print('GraphTransferNW:predict failed, returning mean')
                f = self.y.mean() * np.ones(data.true_y.shape)
            if timing_test:
                toc()
        if timing_test:
            A_inv = np.linalg.inv(A)
            print('approx error: ' + str(
                norm(inv_approx - A_inv) / norm(A_inv)))
        o = results.Output(data)
        if self.predict_sample is not None:
            # f only covers the instances in I; extend to every instance
            # by fitting the secondary learner on (x[I], f).
            nw_data = data_lib.Data(data.x[I, :], f)
            self.nw_learner.train_and_test(nw_data)
            nw_output = self.nw_learner.predict(data)
            o.y = nw_output.y
            o.fu = nw_output.y
        else:
            o.y = f
            o.fu = f

        return o
Ejemplo n.º 12
0
def mpi_run_main_args(args):
    """Run `main.run_main_args` on `args`, adding this host's MPI communicator.

    The communicator registered for the current hostname in `mpi_comms`
    is appended as an extra trailing argument when there is more than one
    communicator or `parallelize_cv` is set. The input sequence itself is
    not modified. Returns None.
    """
    call_args = list(args)
    if len(mpi_comms) > 1 or parallelize_cv:
        hostname = helper_functions.get_hostname()
        call_args.append(mpi_comms[hostname])
    main.run_main_args(call_args)


def results_exist(configs):
    """Return True if `configs.results_file` names an existing regular file."""
    return os.path.isfile(configs.results_file)


if __name__ == '__main__':
    timer.tic()
    pc = configs_lib.create_project_configs()
    #num_labels_list = num_labels_list[0:10]
    if use_mpi:
        mpi_rollcall()
        '''
        from mpipool import core as mpipool
        pool = mpipool.MPIPool(debug=debug_mpi_pool, loadbalance=True)        
        pool.map(main.run_main_args, num_labels_list)
        #pool.map(launch_subprocess_args, num_labels_list)
        pool.close()
        '''
        mpi_comms = mpi_utility.mpi_comm_by_node(include_root=False)
        if len(mpi_comms) > 1 or parallelize_cv:
            pool = mpi_group_pool.MPIGroupPool(debug=False,
                                               loadbalance=True,
Ejemplo n.º 13
0
    def create_sampling_distribution(self, base_learner, data, fold_results):
        """Build a sampling distribution over instances for active selection.

        A copy of `self.base_learner` is trained on the source subset of
        `data` (relabelled as target) and its predictions are used as
        pseudo-labels for the candidate target instances. Candidates are
        then scored by one of three strategies:

        * `self.use_instance_selection`: instances chosen by
          `self.instance_selector` get score 0, all others 1;
        * `self.use_density`: estimated variance divided by estimated
          density;
        * otherwise: cluster the candidates and score each cluster by
          label variance divided by cluster size, skipping clusters with
          at most 0.5% of the candidates.

        The `self.configs.active_items_per_iteration` lowest-scoring
        items (cluster centroids in the clustering case) are selected.

        Args:
            base_learner: unused; `self.base_learner` is used instead.
            data: the full data set (source and target instances).
            fold_results: unused.

        Returns:
            (d, d.size): `d` has the shape of `data.y`, with equal mass on
            each selected instance and zero elsewhere.
        """
        cluster_scale = self.cluster_scale
        source_learner = deepcopy(self.base_learner)
        source_data = data.get_transfer_subset(self.configs.source_labels)
        if source_data.n > 1000:
            source_data = source_data.rand_sample(.2)
            print('subsampling source data: ' + str(source_data.n))
        # Relabel source instances as target so the learner treats them
        # as one task.
        if source_data.is_regression:
            source_data.data_set_ids[:] = self.configs.target_labels[0]
        else:
            source_data.change_labels(self.configs.source_labels,
                                      self.configs.target_labels)
        tic()
        source_learner.train_and_test(source_data)
        print('train source time: ' + toc_str())
        target_data = data.get_transfer_subset(self.configs.target_labels,
                                               include_unlabeled=True)
        y_pred = source_learner.predict(data).y
        if self.use_oracle_target:
            # Oracle variant: pseudo-labels come from a learner trained on
            # the true target labels.
            target_learner = deepcopy(self.base_learner)
            oracle_target_data = deepcopy(target_data)
            oracle_target_data.y = oracle_target_data.true_y
            oracle_target_data.is_train[:] = True
            target_learner.train_and_test(oracle_target_data)
            y_pred_target = target_learner.predict(data).y
            y_pred = y_pred_target
        if self.use_oracle_labels:
            y_pred = data.true_y.copy()

        n_items = self.configs.active_items_per_iteration
        # Copy the mask: the in-place `&=` below would otherwise modify
        # data.is_train itself.
        I = data.is_train.copy()
        if not self.use_warm_start:
            I &= ~data.is_labeled
        if self.configs.target_labels is not None:
            I &= data.get_transfer_inds(self.configs.target_labels)
        I = I.nonzero()[0]
        if self.max_items_for_instance_selection is not None and \
                        I.size > self.max_items_for_instance_selection:
            I = np.random.choice(I,
                                 self.max_items_for_instance_selection,
                                 replace=False)
            print('subsampling target data: ' + str(I.size))

        # Candidate set with predicted values substituted for every label
        # field, treated as fully labelled training data.
        labeled_target_data = deepcopy(data.get_subset(I))
        instances_to_keep = labeled_target_data.is_labeled
        labeled_target_data.set_train()
        labeled_target_data.is_noisy = array_functions.false(
            labeled_target_data.n)

        labeled_target_data.y = y_pred[I].copy()
        labeled_target_data.true_y = y_pred[I].copy()
        labeled_target_data.y_orig = y_pred[I].copy()
        labeled_target_data.instances_to_keep = instances_to_keep

        if self.use_instance_selection:
            self.instance_selector.subset_size = n_items
            self.instance_selector.num_samples = n_items
            self.instance_selector.configs.use_validation = False
            self.instance_selector.configs.use_training = True
            self.instance_selector.train_and_test(labeled_target_data)
            is_selected = self.instance_selector.predict(
                labeled_target_data).is_selected
            scores = np.ones(is_selected.size)
            # Lower score is better
            scores[is_selected] = 0
            scores_sorted_inds = np.argsort(scores)
            print('')
        elif self.use_density:
            target_learner = deepcopy(self.base_learner)
            target_learner.train_and_test(labeled_target_data)
            variances = self.estimate_variance(
                target_learner,
                labeled_target_data,
            )
            densities = self.estimate_density(labeled_target_data)
        else:
            X_sub = data.x[I, :]
            tic()
            X_cluster_space, cluster_ids = self.create_clustering(
                X_sub,
                int(cluster_scale * self.configs.active_items_per_iteration))
            print('cluster target time: ' + toc_str())
            variances, cluster_n = self.get_cluster_purity(
                cluster_ids, y_pred[I], not target_data.is_regression)
            true_vars, true_cluster_n = self.get_cluster_purity(
                cluster_ids, data.true_y[I], not target_data.is_regression)
            if self.use_target_variance:
                variances = true_vars
            centroid_idx = self.get_cluster_centroids(X_cluster_space)
            densities = cluster_n

        use_clusters = not self.use_density and not self.use_instance_selection
        if not self.use_instance_selection:
            scores = variances / densities
            if use_clusters:
                # Exclude tiny clusters. This has to happen before sorting;
                # masking after argsort has no effect on the selection.
                scores[cluster_n <= .005 * I.size] = np.inf
            scores_sorted_inds = np.argsort(scores)

        if use_clusters:
            # Scores are per cluster; map the best clusters to centroids.
            to_use = centroid_idx[scores_sorted_inds[:n_items]]
        else:
            to_use = scores_sorted_inds[:n_items]
        if self.transfer_hyperparameters:
            # NOTE(review): is_selected is only defined when
            # use_instance_selection is enabled -- confirm these two
            # options are always used together.
            target_learner = deepcopy(self.base_learner)
            target_learner.configs.use_validation = True
            labeled_target_data.y[~is_selected] = np.nan

            target_learner.train_and_test(labeled_target_data)
            # Pin the base learner to the tuned parameters and replace its
            # CV grid with a single placeholder entry.
            self.base_learner.base_learner.cv_params = {'unused': [0]}
            self.base_learner.base_learner.best_params = target_learner.base_learner.best_params
            self.base_learner.base_learner.set_params(
                **target_learner.base_learner.best_params)
        d = np.zeros(data.y.shape)
        d[I[to_use]] = 1
        d = d / d.sum()
        return d, d.size
Ejemplo n.º 14
0
from timer import timer
import sys
# Counting workload driven by the command line:
#   argv[1] -> suffix for the output name, argv[2] -> start, argv[3] -> stop.
# The timer is started here; no toc() call follows, so nothing is reported.
timer.tic()
x = int(sys.argv[2])
# Increment once per integer in [start, stop); the range bounds are fixed
# before the first iteration.
for i in range(x, int(sys.argv[3])):
    x += 1
s = 'count_test_output_{0}'.format(sys.argv[1])
Ejemplo n.º 15
0
    tic()
    A = X.T.dot(X) + C*np.eye(p)
    k = X.T.dot(y)
    w = np.linalg.solve(A, k)
    toc()

def cvx_test(*args):
    """Time an SCS solve of least squares with a squared 2-norm penalty.

    Data are uniform on [-1, 1]: a 5000x100 matrix and a length-5000
    target. The penalty weight is 1e-3 and the constraint list is empty.
    Arguments are ignored.
    """
    rows, cols = 5000, 100
    A_data = np.random.uniform(-1, 1, (rows, cols))
    weight_decay = 1e-3
    b_data = np.random.uniform(-1, 1, rows)

    w = cvx.Variable(cols)
    residual_cost = cvx.sum_entries(cvx.square(A_data * w - b_data))
    shrinkage_cost = cvx.norm2(w)**2
    goal = cvx.Minimize(residual_cost + weight_decay * shrinkage_cost)
    problem = cvx.Problem(goal, [])

    # Only the solver call is inside the timed region.
    tic()
    problem.solve(solver=cvx.SCS, verbose=False)
    toc()



if __name__ == '__main__':
    # All ranks execute run_test(); timing is started and stopped only on
    # rank 0 of MPI.COMM_WORLD.
    comm = MPI.COMM_WORLD
    is_master = (comm.Get_rank() == 0)

    if is_master:
        tic()
    run_test()
    if is_master:
        toc()