def split_data(file, configs):
    data = helper_functions.load_object(file)
    data.is_regression = configs.is_regression
    splitter = DataSplitter()
    splitData = data_lib.SplitData()
    splitData.data = data
    num_splits = 30
    perc_train = .8
    keep_for_splitting = None
    if configs.split_data_set_ids is not None:
        keep_for_splitting = array_functions.false(data.n)
        keep_for_splitting[data.data_set_ids == 0] = True
    #Pretend data_set_ids is a label vector to ensure each data set is split equally
    if data.is_regression and data.data_set_ids is not None:
        assert len(data.data_set_ids) == data.n
        is_regression = False
        splitData.splits = splitter.generate_splits(data.data_set_ids,
                                                    num_splits, perc_train,
                                                    is_regression,
                                                    keep_for_splitting)
    else:
        splitData.splits = splitter.generate_splits(data.y, num_splits,
                                                    perc_train,
                                                    data.is_regression,
                                                    keep_for_splitting)
    splitData.data_set_ids_to_keep = configs.data_set_ids_to_keep
    split_dir = os.path.dirname(file)
    save_file = split_dir + '/split_data.pkl'
    helper_functions.save_object(save_file, splitData)
    return splitData
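
A minimal usage sketch for split_data above. The configs object only needs the three attributes the function reads; the class name, attribute values, and file path here are hypothetical stand-ins, not the project's real configs.

class SplitConfigs(object):
    # hypothetical stand-in for the real configs object
    is_regression = True
    split_data_set_ids = None
    data_set_ids_to_keep = None

split_data('data_sets/boston_housing/raw_data.pkl', SplitConfigs())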
Example 2
def test_mnist():
    num_per_class = 50
    data = helper_functions.load_object('../data_sets/mnist/raw_data.pkl')
    classes_to_use = [0, 4, 8, 7]
    I = array_functions.find_set(data.y, classes_to_use)
    data = data.get_subset(I)
    to_keep = None
    for i in classes_to_use:
        inds = (data.y == i).nonzero()[0]
        I = np.random.choice(inds, size=num_per_class, replace=False)
        if to_keep is None:
            to_keep = I
        else:
            to_keep = np.concatenate((to_keep, I))
    data.change_labels([classes_to_use[1], classes_to_use[3]], [classes_to_use[0], classes_to_use[2]])
    data.change_labels([classes_to_use[0], classes_to_use[2]], [0, 1])
    # to_keep holds integer indices; '~' on an int array is bitwise NOT, so
    # build a boolean mask before taking the complement
    keep_mask = array_functions.false(data.n)
    keep_mask[to_keep] = True
    data_test = data.get_subset(~keep_mask)
    data = data.get_subset(keep_mask)
    label_names = [
        str(classes_to_use[0]) + '+' + str(classes_to_use[1]),
        str(classes_to_use[2]) + '+' + str(classes_to_use[3]),
    ]

    #data = add_label_noise_cluster(data, num_neighbors=20)
    #data = add_label_noise(data, 20)
    test_methods(data.x, data.y, data_test.x, data_test.y, label_names, mnist=True)
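
The fix above relies on a standard numpy idiom: `~` on an integer index array is bitwise NOT, not set complement, so the indices are first converted to a boolean mask. A self-contained illustration:

import numpy as np

to_keep = np.array([0, 2, 5])        # integer indices selected for training
mask = np.zeros(8, dtype=bool)       # boolean mask over all 8 instances
mask[to_keep] = True
print(np.arange(8)[mask])            # [0 2 5]
print(np.arange(8)[~mask])           # complement: [1 3 4 6 7]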
Example 3
def _load_temp_experiment_file(final_file_name, num_labels):
    experiment_temp_file = _temp_experiment_file_name(final_file_name, num_labels)
    if not os.path.isfile(experiment_temp_file):
        return None
    if mpi_utility.is_master():
        print 'found ' + experiment_temp_file + ' - loading'
    return helper_functions.load_object(experiment_temp_file)
Example 4
def _load_temp_split_file(final_file_name, num_labels, split):
    split_temp_file = _temp_split_file_name(final_file_name, num_labels, split)
    if not os.path.isfile(split_temp_file):
        return None
    if mpi_utility.is_master():
        print 'found ' + split_temp_file + ' - loading'
    return helper_functions.load_object(split_temp_file)
Example 5
def create_kc_housing():
    file = 'kc_housing/processed_data.pkl'
    x, y = helper_functions.load_object(file)
    data = data_class.Data(x, y)
    data.is_regression = True
    s = kc_housing_file
    helper_functions.save_object(s, data)
Example 6
def viz(pc, fig=None, show_histogram=False, show=True):
    import create_data_set
    from methods import method
    source_learner = method.NadarayaWatsonMethod()
    target_learner = method.NadarayaWatsonMethod()
    #pc = configs_lib.ProjectConfigs()
    data = helper_functions.load_object('../' + pc.data_file).data
    data.set_train()
    source_data = data.get_transfer_subset(pc.source_labels)
    source_data.set_target()
    target_data= data.get_transfer_subset(pc.target_labels)
    target_data.set_target()
    source_learner.train_and_test(source_data)
    target_learner.train_and_test(target_data)
    source_learner.sigma = 10
    target_learner.sigma = 10
    x = array_functions.vec_to_2d(np.linspace(data.x.min(), data.x.max(), 100))
    test_data = data_lib.Data()
    test_data.x = x
    test_data.is_regression = True
    y_s = source_learner.predict(test_data).fu
    y_t = target_learner.predict(test_data).fu

    #array_functions.plot_line(x,y_t-y_s,pc.data_set,y_axes=np.asarray([-5,5]))
    y = y_t-y_s
    #y = y - y.mean()
    array_functions.plot_line(x, y, title=None, fig=fig, show=show)
    if show_histogram:
        array_functions.plot_histogram(data.x,20)
    x=1
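
viz above plots the difference between two Nadaraya-Watson fits (y_t - y_s). For reference, a minimal self-contained sketch of the estimator itself, assuming a Gaussian kernel on 1-D inputs; method.NadarayaWatsonMethod is not shown in this listing, so this is an illustration of the idea, not its actual implementation.

import numpy as np

def nw_predict(x_train, y_train, x_test, sigma):
    # Gaussian-kernel weights between every test and training point
    d2 = (x_test[:, None] - x_train[None, :]) ** 2
    w = np.exp(-d2 / (2.0 * sigma ** 2))
    # each prediction is a kernel-weighted average of training labels
    return w.dot(y_train) / w.sum(axis=1)

x_train = np.linspace(0, 10, 50)
y_train = np.sin(x_train)
print(nw_predict(x_train, y_train, np.array([2.5, 7.5]), sigma=0.5))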
Example 8
def create_pollution(
    labels_to_use=np.arange(2), series_to_use=0, num_instances=None, normalize_xy=True, save_data=True
):
    file = "pollution/processed_data.pkl"
    y, ids = helper_functions.load_object(file)
    y_to_use = y[:, series_to_use, :]
    print str(series_to_use) + ": " + ids[series_to_use]
    data = data_class.TimeSeriesData(y_to_use, np.asarray([ids[series_to_use]]))
    data.is_regression = True
    data.keep_series(labels_to_use)
    data = data.get_min_range()
    data.smooth_missing()
    data.x = data.x.astype(np.float)
    if num_instances is not None:
        data = data.get_range([0, num_instances])
    if normalize_xy:
        data.reset_x()
        data.normalize_y()

    data = data.create_data_instance()
    # perc_used = data.get_perc_used()
    if num_instances is not None:
        s = "pollution-%d-%d" % (series_to_use, num_instances)
    else:
        s = "pollution-%d" % series_to_use
    if normalize_xy:
        s += "-norm"
    s += "/raw_data.pkl"
    # array_functions.plot_2d_sub_multiple_y(data.x, data.y, title=None, sizes=10)
    array_functions.plot_2d_sub(data.x, data.y, data_set_ids=data.data_set_ids, title=None, sizes=10)
    if save_data:
        helper_functions.save_object(s, data)
Example 10
def run_main():
    #data_dir = 'data_sets/concrete'
    data_dir = 'data_sets/boston_housing'
    #data_dir = 'data_sets/kc_housing'
    #data_dir = 'data_sets/synthetic_linear_reg500-50-1.01'
    #data_dir = 'data_sets/drosophilia'
    data_dir = 'data_sets/synthetic_linear_reg500-10-1.01'
    data_file = data_dir + '/split_data.pkl'
    data = helper_functions.load_object(data_file).data
    data.set_target()
    data.set_train()
    data.set_true_y()
    #data.x = array_functions.select_k_features(data.x, data.y, 50)
    # 'estimator', norm() and pearsonr() are assumed to be defined at module scope
    estimator.train_and_test(data)
    w_normalized = estimator.w / norm(estimator.w)
    w_normalized = np.expand_dims(w_normalized, 1)
    print w_normalized
    p = estimator.w.size
    corr = np.zeros((p,1))
    for i in range(p):
        xi = data.x[:,i]
        y = data.true_y
        corr[i] = pearsonr(xi, y)[0]
    print corr
    m = np.concatenate((w_normalized, corr), 1)
    print m
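
run_main compares normalized regression weights against per-feature Pearson correlations. A self-contained illustration of that diagnostic on synthetic data (pearsonr is from scipy.stats):

import numpy as np
from scipy.stats import pearsonr

rng = np.random.RandomState(0)
x = rng.randn(200, 4)
w_true = np.array([2.0, -1.0, 0.0, 0.5])
y = x.dot(w_true) + 0.1 * rng.randn(200)
corr = np.array([pearsonr(x[:, i], y)[0] for i in range(x.shape[1])])
print(corr)  # features with larger |weight| show stronger correlation with y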
Example 13
    def load_data_and_splits(self, data_file):

        data_and_splits = helper_functions.load_object(data_file)
        data_and_splits.data.repair_data()
        assert self.configs.num_splits <= len(data_and_splits.splits)
        data_and_splits.labels_to_keep = self.configs.labels_to_keep
        data_and_splits.labels_to_not_sample = self.configs.labels_to_not_sample
        data_and_splits.target_labels = self.configs.target_labels
        data_and_splits.data.repair_data()
        return data_and_splits
Example 14
def create_time_series(label_to_use=0,
                       series_to_use=(0,),
                       num_instances=None,
                       normalize_x=False,
                       save_data=True,
                       name='CO2_emissions'):
    # series_to_use must be iterable: the loop below visits each series index
    file = name + '/processed_data.pkl'
    all_data = []
    for i in series_to_use:
        y, ids = helper_functions.load_object(file)
        y_to_use = y[:, i, :]
        print str(i) + ': ' + ids[i]
        data = data_class.TimeSeriesData(y_to_use, np.asarray([ids[i]]))
        data.is_regression = True
        data.keep_series(label_to_use)
        data = data.get_min_range()
        data.smooth_missing()
        data = data.get_nth(7)
        data.reset_x()
        data.x = data.x.astype(np.float)
        if num_instances is not None:
            data = data.get_range([0, num_instances])
        data = data.get_range([1000, 1500])
        if normalize_x:
            data.x -= data.x.min()
            data.x /= data.x.max()
        data = data.create_data_instance()
        try:
            if len(series_to_use) > 1:
                data.data_set_ids[:] = i
        except:
            pass
        all_data.append(data)
        # perc_used = data.get_perc_used()
    data = all_data[0]
    del all_data[0]
    for di in all_data:
        data.combine(di)
    if num_instances is not None:
        s = name + '-%s-%d' % (str(series_to_use), num_instances)
    else:
        s = name + '-%s' % str(series_to_use)
    if normalize_x:
        s += '-norm'
    s += '/raw_data.pkl'
    # array_functions.plot_2d_sub_multiple_y(data.x, data.y, title=None, sizes=10)
    array_functions.plot_2d_sub(data.x,
                                data.y,
                                data_set_ids=data.data_set_ids,
                                title=None,
                                sizes=10)
    if save_data:
        helper_functions.save_object(s, data)
Example 15
def create_pollution(labels_to_use=np.arange(2),
                     series_to_use=[0],
                     num_instances=None,
                     normalize_xy=True,
                     save_data=True):
    #series_to_use = 1
    file = 'pollution/processed_data.pkl'
    y, ids = helper_functions.load_object(file)
    data = None
    label_names = []
    for i in range(y.shape[1]):
        print str(i) + '-' + ids[i] + ': ' + str(y[0, i, :])
    for idx, s in enumerate(series_to_use):
        for label in labels_to_use:
            label_names.append(str(label) + '-' + ids[s])
        y_to_use = y[:, s, :]
        print str(s) + ': ' + ids[s]
        time_series_data = data_class.TimeSeriesData(y_to_use,
                                                     np.asarray([ids[s]]))
        time_series_data.is_regression = True
        time_series_data.keep_series(labels_to_use)
        time_series_data = time_series_data.get_min_range()
        time_series_data.smooth_missing()
        time_series_data.x = time_series_data.x.astype(np.float)
        if num_instances is not None:
            time_series_data = time_series_data.get_range([0, num_instances])

        if normalize_xy:
            time_series_data.reset_x()
            #time_series_data.normalize_y()

        curr_data = time_series_data.create_data_instance()
        curr_data.data_set_ids += idx * labels_to_use.size
        if data is None:
            data = curr_data
        else:
            data.combine(curr_data)
    data.label_names = label_names
    #perc_used = data.get_perc_used()
    if num_instances is not None:
        s = 'pollution-%s-%s' % (str(series_to_use), str(num_instances))
    else:
        # series_to_use is a list; '%d' would raise a TypeError here
        s = 'pollution-%s' % str(series_to_use)
    if normalize_xy:
        s += '-norm'
    s += '/raw_data.pkl'
    #array_functions.plot_2d_sub_multiple_y(data.x, data.y, title=None, sizes=10)
    array_functions.plot_2d_sub(data.x,
                                data.y,
                                data_set_ids=data.data_set_ids,
                                title=None,
                                sizes=10)
    if save_data:
        helper_functions.save_object(s, data)
Example 16
def run_main(results_file):
    results = helper_functions.load_object(results_file)
    x_list = []
    g_list = []
    y_axes = [-4,4]
    for i, r in enumerate(results.results_list):
        x, g = aggregate_g(r)
        x_list.append(x)
        g_list.append(g)
    array_functions.plot_line_sub(x_list, g_list, title=results_file, y_axes=y_axes)
    x = 1
Example 17
    def run_experiments(self):
        data_file = self.configs.data_file
        data_and_splits = helper_functions.load_object(data_file)
        data_and_splits.data.repair_data()
        assert self.configs.num_splits <= len(data_and_splits.splits)
        data_and_splits.labels_to_keep = self.configs.labels_to_keep
        data_and_splits.labels_to_not_sample = self.configs.labels_to_not_sample
        data_and_splits.target_labels = self.configs.target_labels
        data_and_splits.data.repair_data()
        results_file = self.configs.results_file
        comm = mpi_utility.get_comm()
        if os.path.isfile(results_file):
            if mpi_utility.is_group_master():
                print results_file + ' already exists - skipping'
            return            
        if mpi_utility.is_group_master():
            hostname = helper_functions.get_hostname()
            print '(' + hostname  + ') Running experiments: ' + results_file
        learner = self.configs.learner
        learner.run_pre_experiment_setup(data_and_splits)
        num_labels = len(self.configs.num_labels)
        num_splits = self.configs.num_splits
        #method_results = results.MethodResults(n_exp=num_labels, n_splits=num_splits)
        method_results = self.configs.method_results_class(n_exp=num_labels, n_splits=num_splits)
        for i, nl in enumerate(self.configs.num_labels):
            method_results.results_list[i].num_labels = nl

        split_idx = self.configs.split_idx
        if split_idx is not None:
            num_labels_list = list(itertools.product(range(num_labels), [split_idx]))
        else:
            num_labels_list = list(itertools.product(range(num_labels), range(num_splits)))

        shared_args = (self, results_file, data_and_splits, method_results)
        args = [shared_args + (i_labels, split) for i_labels,split in num_labels_list]
        if self.configs.use_pool:
            pool = multiprocessing_utility.LoggingPool(processes=self.configs.pool_size)
            all_results = pool.map(_run_experiment, args)
        else:
            all_results = [_run_experiment(a) for a in args]
        for curr_results,s in zip(all_results,num_labels_list):
            if curr_results is None:
                continue
            i_labels, split = s
            method_results.set(curr_results, i_labels, split)

        method_results.configs = self.configs
        if self.configs.should_load_temp_data:
            helper_functions.save_object(results_file,method_results)
            for i_labels, split in num_labels_list:
                num_labels = self.configs.num_labels[i_labels]
                _delete_temp_split_files(results_file, num_labels, split)
            _delete_temp_folder(results_file)
Example 18
def create_spatial_data(dir='climate-month'):
    file = dir + '/processed_data.pkl'
    locs, y, ids = helper_functions.load_object(file)
    y = y.T
    is_missing_loc = (~np.isfinite(locs)).any(1)
    locs = locs[~is_missing_loc, :]
    y = y[~is_missing_loc, :]
    ids = ids[~is_missing_loc]
    data = data_class.Data(locs, y)
    data.multilabel_to_multisource()
    s = dir + '/raw_data.pkl'
    helper_functions.save_object(s, data)
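
The row filter above is a common numpy pattern: drop every location with any non-finite coordinate. A small standalone demonstration:

import numpy as np

locs = np.array([[1.0, 2.0], [np.nan, 3.0], [4.0, np.inf], [5.0, 6.0]])
y = np.array([10.0, 20.0, 30.0, 40.0])
is_missing_loc = (~np.isfinite(locs)).any(1)   # True if any coordinate is nan/inf
print(locs[~is_missing_loc])                   # rows 0 and 3 survive
print(y[~is_missing_loc])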
Example 19
def get_sized_results(file_name):
    file_name_no_suffix = os.path.basename(helper_functions.remove_suffix(file_name, '.pkl'))
    dir_name = os.path.dirname(file_name)
    all_files = os.listdir(dir_name)
    sized_file_name = file_name_no_suffix + '-num_labels='
    files = []
    results = []
    for s in all_files:
        if sized_file_name in s:
            files.append(dir_name + '/' + s)
            results.append(helper_functions.load_object(dir_name + '/' + s))
    return results
Example 20
def create_spatial_data(dir="climate-month"):
    file = dir + "/processed_data.pkl"
    locs, y, ids = helper_functions.load_object(file)
    # y = y.T
    is_missing_loc = (~np.isfinite(locs)).any(1)
    locs = locs[~is_missing_loc, :]
    y = y[~is_missing_loc, :]
    ids = ids[~is_missing_loc]
    data = data_class.Data(locs, y)
    data.multilabel_to_multisource()
    s = dir + "/raw_data.pkl"
    helper_functions.save_object(s, data)
Example 21
def subset_1_per_instance_id():
    data = helper_functions.load_object('data_sets/' + create_data_set.adience_aligned_cnn_file)
    to_keep = array_functions.false(data.n)
    all_ids = np.unique(data.instance_ids)
    for id in all_ids:
        has_id = (data.instance_ids == id).nonzero()[0]
        to_keep[has_id[0]] = True
        pass
    to_keep = to_keep & data.is_labeled
    data = data.get_subset(to_keep)
    helper_functions.save_object('data_sets/' + create_data_set.adience_aligned_cnn_1_per_instance_id_file,
                                 data)
    pass
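
The loop above keeps the first row for each instance id; np.unique with return_index gives an equivalent vectorized form using only numpy:

import numpy as np

instance_ids = np.array([7, 7, 3, 3, 3, 9])
_, first_idx = np.unique(instance_ids, return_index=True)
to_keep = np.zeros(instance_ids.size, dtype=bool)
to_keep[first_idx] = True
print(to_keep)  # exactly one True per distinct id, at its first occurrence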
Example 22
def vis_data():
    s = 'data_sets/' + data_file_dir + '/raw_data.pkl'
    data = helper_functions.load_object(s)
    x = data.x
    y = data.y
    for i in range(data.p):
        xi = x[:, i]
        title = 'Feature Names Missing'
        if data.feature_names is not None:
            title = data.feature_names[i]
        array_functions.plot_2d(xi, y, data_set_ids=data.data_set_ids, title=title)
Example 24
def create_drosophila():
    data = helper_functions.load_object("drosophilia/processed_data.pkl")
    x, y = data
    y = np.reshape(y, y.shape[0])
    I = np.random.choice(x.shape[0], size=500, replace=False)
    x = x[I, :]
    y = y[I]
    data = data_class.Data()
    data.x = x
    data.y = y
    data.set_train()
    data.set_target()
    data.set_true_y()
    data.is_regression = True

    helper_functions.save_object(drosophila_file, data)
Example 25
def run_main():
    import create_data_set
    from methods import method
    learner = method.NadarayaWatsonMethod()
    #s = create_data_set.synthetic_step_transfer_file
    #s = create_data_set.synthetic_delta_linear_file
    #s = create_data_set.synthetic_step_linear_transfer_file
    #s = create_data_set.boston_housing_raw_data_file % '-13'
    #s = create_data_set.concrete_file % '-7'
    #s = create_data_set.concrete_file % '-feat=0'
    #s = create_data_set.bike_file % '-feat=1'
    #s = create_data_set.wine_file % '-small-11'
    #s = create_data_set.boston_housing_raw_data_file % ''
    #learner = None
    # NOTE: exactly one of the commented-out 's = ...' lines above must be
    # uncommented before running; 's' is undefined otherwise
    data = helper_functions.load_object(s)
    viz_features(data.x, data.y, data.data_set_ids, learner=learner)
Example 29
def create_drought(label_to_use=0,
                   series_to_use=0,
                   num_instances=None,
                   normalize_x=False,
                   save_data=True):
    file = 'drought/processed_data.pkl'
    y, ids = helper_functions.load_object(file)
    y_to_use = y[:, series_to_use, :]
    print str(series_to_use) + ': ' + ids[series_to_use]
    data = data_class.TimeSeriesData(y_to_use,
                                     np.asarray([ids[series_to_use]]))
    data.is_regression = True
    data.keep_series(label_to_use)
    data = data.get_min_range()
    data.smooth_missing()
    data.x = data.x.astype(np.float)
    if num_instances is not None:
        data = data.get_range([0, num_instances])
    if normalize_x:
        data.x -= data.x.min()
        data.x /= data.x.max()
    data = data.create_data_instance()
    # perc_used = data.get_perc_used()
    if num_instances is not None:
        s = 'drought-%d-%d' % (series_to_use, num_instances)
    else:
        s = 'drought-%d' % series_to_use
    if normalize_x:
        s += '-norm'
    s += '/raw_data.pkl'
    # array_functions.plot_2d_sub_multiple_y(data.x, data.y, title=None, sizes=10)
    array_functions.plot_2d_sub(data.x,
                                data.y,
                                data_set_ids=data.data_set_ids,
                                title=None,
                                sizes=10)
    if save_data:
        helper_functions.save_object(s, data)
Example 30
import numpy as np
from utility import helper_functions

data = helper_functions.load_object('debug_data.pkl')

A = data['A']
S = data['S']
y = data['y']

v = S.dot(y)
try:
    np.linalg.lstsq(A, v)
    print 'it worked!'
except:
    print 'error caught'
pass
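
For reference, np.linalg.lstsq on a small well-posed system; the debug script above applies the same call to its pickled A and S.dot(y):

import numpy as np

A = np.array([[1.0, 0.0],
              [0.0, 1.0],
              [1.0, 1.0]])
v = np.array([1.0, 2.0, 3.0])
coef, residuals, rank, sv = np.linalg.lstsq(A, v)
print(coef)  # least-squares solution: [1. 2.]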
Example 31
class data_statistics(object):
    def __init__(self, mean, std, range):
        # reconstructed constructor; gen_stats below calls data_statistics(mean, std, range)
        self.mean = mean
        self.std = std
        self.range = range

def gen_stats(data):
    mean = data.true_y.mean()
    std = data.true_y.std()
    range = [data.true_y.min(), data.true_y.max()]
    stats = data_statistics(mean, std, range)
    return stats

def plot_labels(data):
    n, bins, patches = plt.hist(data.true_y, 50, normed=1, facecolor='green', alpha=0.75)

if __name__ == '__main__':
    dirs = [
        synthetic_dir, boston_housing_dir, adience_dir, wine_dir
    ]
    titles = [
        'synthetic', 'housing', 'adience', 'wine'
    ]
    num_rows = 2
    num_cols = int(math.ceil(len(dirs) / float(num_rows)))
    stats = []
    for i, d in enumerate(dirs):
        plt.subplot(num_rows, num_cols, i + 1)  # subplot indices are 1-based
        data = helper_functions.load_object('../' + d + '/split_data.pkl').data
        stats.append(gen_stats(data))
        plot_labels(data)
        plt.title(titles[i])
    plt.show()
    pass
Example 32
def create_table():
    vis_configs = configs_lib.VisualizationConfigs()
    viz_params = configs_lib.viz_params
    n = len(viz_params)

    if getattr(vis_configs, 'figsize', None):
        fig = plt.figure(figsize=vis_configs.figsize)
    else:
        fig = plt.figure()
    # fig.suptitle('Results')
    # num_rows = min(n, configs_lib.max_rows)
    cell_text = [[np.nan]*len(vis_configs.results_files) for i in range(len(viz_params))]
    cols = []
    rows = []
    size_to_vis = vis_configs.size_to_vis
    baseline_perf = []
    all_perf = []
    data_names = []
    for data_set_idx, curr_viz_params in enumerate(viz_params):
        vis_configs = configs_lib.VisualizationConfigs(**curr_viz_params)
        param_text = []
        if len(rows) <= data_set_idx:
            rows.append(vis_configs.results_dir)
        method_idx = 0
        mean_perf = []

        # Used for column names if not provided by users
        data_names.append(vis_configs.title)
        for file, legend_str in vis_configs.results_files:
            if len(cols) <= method_idx:
                cols.append(legend_str)
            if not os.path.isfile(file):
                print file + " doesn't exist - skipping"
                #assert False, 'Creating Table doesn''t work with missing files'
                cell_text[data_set_idx][method_idx] = 'Missing'
                mean_val = np.nan
            else:
                results = helper_functions.load_object(file)
                sized_results = get_sized_results(file)
                results = combine_results(results, sized_results)
                processed_results = results.compute_error_processed(vis_configs.loss_function, normalize_output=True)
                sizes = results.sizes
                #assert size_to_vis in sizes
                #size_idx = array_functions.find_first_element(sizes, size_to_vis)
                size_idx = 1
                # sizes = sizes[0:4]
                s = legend_str
                if s is None:
                    s = results.configs.learner.name_string
                highs = np.asarray(processed_results.means) + np.asarray(processed_results.highs)
                lows = np.asarray(processed_results.means) - np.asarray(processed_results.lows)
                mean_val = processed_results.means[size_idx]
                var = (highs-lows)[size_idx]/2
            latex_str = '-'
            if mean_val < 1000:
                #latex_str = '%.1f \\pm %.1f' % (mean_val, var)
                latex_str = '%.3f (%.2f)' % (mean_val, var)
                #latex_str = '%.2f(%.2f)' % (mean_val, var)
            cell_text[data_set_idx][method_idx] = latex_str
            if method_idx == vis_configs.baseline_idx:
                baseline_perf.append(mean_val)
            mean_perf.append(mean_val)
            method_idx += 1
        all_perf.append(np.asarray(mean_perf))
        #cell_text.append(param_text)
    relative_improvement = np.zeros((len(all_perf), all_perf[0].size))


    # Create table
    method_names_for_table = vis_configs.method_names_for_table
    latex_text = ''
    for method_name in method_names_for_table:
        latex_text += ' & ' + method_name
    latex_text += '\\\\ \hline \n'
    #latex_text = ' & Ours: Linear & Target Only & LLGC & Reweighting & Offset & SMS & Stacking & Ours with Stacking\\\\ \hline \n'

    # If data names are provided, use them instead of ones in config file
    if vis_configs.data_names_for_table is not None:
        data_names = vis_configs.data_names_for_table
    for row_idx, row_str in enumerate(cell_text):
        latex_text += data_names[row_idx] + ' & '
        for i, cell_str in enumerate(row_str):
            latex_text += ' $' + str(cell_str) + '$'
            if i != len(row_str) - 1:
                latex_text += ' &'
        latex_text += ' \\\\ \\hline\n'
    print latex_text

    # If we don't want the "baseline improvement" row
    if vis_configs.baseline_idx is not None:
        for i in range(relative_improvement.shape[0]):
            #relative_improvement[i, :] = (baseline_perf[i] - all_perf[i]) / baseline_perf[i]
            relative_improvement[i, :] = (baseline_perf[i] - all_perf[i]) / baseline_perf[i]
        mean_relative_improvement = ''
        for ri in relative_improvement.T:
            v = ri[np.isfinite(ri)].mean() * 100
            mean_relative_improvement += ('$%.2f$ & ' % v)
        print 'relative improvement: ' + mean_relative_improvement

    fig, axs = plt.subplots()
    axs.axis('tight')
    axs.axis('off')
    the_table = axs.table(
        cellText=cell_text,
        rowLabels=rows,
        colLabels=cols,
        loc='center'
    )
    the_table.auto_set_font_size(False)
    the_table.set_fontsize(10)
    plt.show()
    print ''
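
The table-body loop above builds one LaTeX row per data set; in isolation the string assembly looks like this (the cell values and row label are hypothetical):

cells = ['0.123 (0.05)', '0.234 (0.04)']
row = 'dataset & ' + ' & '.join('$%s$' % c for c in cells) + ' \\\\ \\hline'
print(row)  # dataset & $0.123 (0.05)$ & $0.234 (0.04)$ \\ \hline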
Example 33
import numpy as np
from data_sets import create_data_set
from utility import array_functions
from utility import helper_functions
from datetime import date
from matplotlib import pyplot as pl
from data import data as data_lib

try:
    data = helper_functions.load_object('train.pkl')
except:
    file_name = 'train.csv'
    feat_names, data = create_data_set.load_csv(file_name,
                                                True,
                                                dtype=np.float,
                                                delim=',')
    data = data.astype(np.float)
    Y = data[:, 0]
    X = data[:, 1:]
    data = {'X': X, 'Y': Y}
    helper_functions.save_object('train.pkl', data)
x = data['X']
x /= 256
y = data['Y']
data = data_lib.Data(x, y)
helper_functions.save_object('raw_data.pkl', data)
pass
Example 34
def vis_data():
    s = '../data_sets/' + data_file_dir + '/raw_data.pkl'
    data = helper_functions.load_object(s)
    x = data.x
    y = data.y
    c = MethodConfigs()
    x_mat = vec_to_matrix(x, y)
    #self.set_data_set_defaults('taxi3', source_labels=[1], target_labels=[0], is_regression=True)
    c.source_labels = np.asarray([1])
    c.target_labels = np.asarray([0])
    c.use_validation = True
    I_target = (c.target_labels[0] == data.data_set_ids).nonzero()[0]
    I_to_use = np.random.choice(I_target, 40, replace=False)
    data.y[I_target] = np.nan
    data.y[I_to_use] = data.true_y[I_to_use]
    learner = local_transfer_methods.LocalTransferDeltaNew(c)
    v = 1
    learner.cv_params['sigma_target'] = learner.create_cv_params(-v, v)
    learner.cv_params['sigma_b'] = learner.create_cv_params(-v, v)
    learner.cv_params['sigma_alpha'] = learner.create_cv_params(-v, v)
    #learner.transform = None
    output = learner.train_and_test(data).prediction

    fig = plt.figure(0)
    plt.title('TODO')
    plt.axis('off')

    I_target = data.get_transfer_inds(c.target_labels)
    vals_to_plot = [
        np.abs(output.ft - output.true_y)**1,
        np.abs(output.y_s + output.b - output.true_y)**1,
        output.alpha,
        np.abs(output.y - output.true_y)**1,
    ]
    titles = [
        'Target Function \nError',
        'Adapted Source \nFunction Error',
        'Mixture Function \n',
        'Final Prediction \nError',
    ]
    min_error = min([vals_to_plot[i].min() for i in [0, 1, 3]])
    max_error = max([vals_to_plot[i].max() for i in [0, 1, 3]])
    print output.b
    print output.alpha
    for i, vals in enumerate(vals_to_plot):
        ax = plt.subplot(1, len(vals_to_plot), i + 1)

        ax.set_title(titles[i], fontsize=15)
        #array_functions.plot_heatmap(data.x[I_target], vals, fig=fig, make_subplot=False, sizes=20)
        #vals -= min_error
        #vals /= max_error
        vals -= vals.min()
        vals /= vals.max()
        vals_reshaped = np.reshape(vals, (40, 40))
        plt.pcolormesh(vals_reshaped, cmap=cm.gray, shading='flat', norm=None)
        ax.set_xlabel('Latitude')
        if i == 0:
            ax.set_ylabel('Longitude')
        else:
            ax.set_ylabel('')
        plt.xticks([], [])
        plt.yticks([], [])
    array_functions.move_fig(fig, 1200, 400)
    #plt.tight_layout(pad=0, h_pad=0, w_pad=0)
    #plt.wsp
    plt.subplots_adjust(left=0.05, right=0.95, top=0.8, bottom=0.1)
    plt.show(block=True)
    print ''
Example 35
import active_project_configs as configs_lib
from utility import helper_functions
import sklearn
from sklearn.linear_model import Ridge
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression
pc = configs_lib.ProjectConfigs()

main_configs = configs_lib.MainConfigs(pc)

data_file = '../' + main_configs.data_file
data_and_splits = helper_functions.load_object(data_file)
learner = main_configs.learner
skl = Ridge(normalize=True)

x = data_and_splits.data.x
y = data_and_splits.data.y

select_k_best = SelectKBest(f_regression, 50)
x = select_k_best.fit_transform(x, y)
skl.fit(x, y)
score = skl.score(x, y)
print 'R2 Score: ' + str(score)
pass
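
A possible follow-up to the script above: SelectKBest records which of the original columns it kept, retrievable via get_support. This sketch assumes the select_k_best variable defined above.

selected = select_k_best.get_support(indices=True)
print('kept feature indices: ' + str(selected))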
Example 36
import scipy.io as sio
import os
import numpy as np
from methods import method
from data import data as data_lib
from data_sets.create_data_split import DataSplitter
from copy import deepcopy
from loss_functions import loss_function
from utility import array_functions
from utility import helper_functions
from methods import transfer_methods
data = helper_functions.load_object('raw_data.pkl')
from configs import base_configs

data_splitter = DataSplitter()
data_splitter.data = data

splits = data_splitter.generate_splits(data.y)

split_data = data_lib.SplitData(data, splits)
use_transfer = True

use_regression = False
m = base_configs.MethodConfigs()
m.use_validation = True
if use_transfer:
    assert not use_regression
    m.loss_function = loss_function.ZeroOneError()
    m.cv_loss_function = loss_function.ZeroOneError()
    transfer_learner = transfer_methods.StackingTransfer(deepcopy(m))
    transfer_learner.base_learner = method.SKLLogisticRegression(deepcopy(m))
Example 37
def load_taxi_data(num_files_to_load=np.inf,
                   num_bins=50,
                   use_alternate=True,
                   return_coords=False):
    all_files = [
        f for f in os.listdir(data_dir) if path.isfile(path.join(data_dir, f))
    ]
    x = []
    y = []
    time = []
    has_passenger = []
    #combined_data_file = 'combined_data.pkl'
    combined_data_file = 'C:/PythonFramework/data_sets/taxi/combined_data.pkl'
    if path.exists(combined_data_file):
        print 'loading combined data...'
        all_data = helper_functions.load_object(combined_data_file)
        print 'done loading data'
    else:
        for i, file in enumerate(all_files):
            if i == num_files_to_load:
                break
            if i >= 535:
                break
            file_data = load_csv(path.join(data_dir, file),
                                 has_field_names=False,
                                 delim=str(' '))[1]
            y.append(file_data[:, 0])
            x.append(file_data[:, 1])
            has_passenger.append(file_data[:, 2])
            time.append(file_data[:, 3])
            print i
        all_data = {
            'x': x,
            'y': y,
            'has_passenger': has_passenger,
            'time': time
        }
        print 'saving combined data...'
        helper_functions.save_object(combined_data_file, all_data)
    x = all_data['x']
    y = all_data['y']
    has_passenger = all_data['has_passenger']
    time = all_data['time']
    x_all = np.concatenate(x)
    y_all = np.concatenate(y)
    time_all = np.concatenate(time)

    has_passenger_all = np.concatenate(has_passenger)

    pickup_inds = get_pickup_inds(x_all, y_all, time_all, has_passenger_all)
    # 'just_pickup' is assumed to be a module-level flag
    if just_pickup:
        x_all = x_all[pickup_inds]
        y_all = y_all[pickup_inds]
        has_passenger_all = has_passenger_all[pickup_inds]
        time_all = time_all[pickup_inds]
    #x_bounds = [-122.45677419354838, -122.38322580645161]
    #y_bounds = [37.738054968287521, 37.816543340380548]

    x_bounds = [-122.48, -122.35]
    y_bounds = [37.7, 37.84]

    #x_bounds = [-np.inf, np.inf]
    #y_bounds = x_bounds
    is_in_range = in_range(x_all, *x_bounds) & in_range(y_all, *y_bounds)
    x_all = x_all[is_in_range]
    y_all = y_all[is_in_range]
    x_all = quantize_loc(x_all, num_bins)
    y_all = quantize_loc(y_all, num_bins)
    time_all = time_all[is_in_range]

    get_hour_vec = np.vectorize(get_hour)
    hours = get_hour_vec(time_all)
    '''
    get_day_vec = np.vectorize(get_day)
    days = get_day_vec(time_all)
    '''
    has_passenger_all = has_passenger_all[is_in_range]

    suffix = '3'
    is_morning = (hours == 9)
    is_night = (hours == 18)
    #is_morning = (hours == 6) & (days == 21)
    #is_night = (hours == 18) & (days == 21)
    #is_morning = (days == 21)
    #is_night = (days == 24)
    if use_alternate:
        is_morning = (hours >= 5) & (hours <= 12)
        is_night = (hours >= 17)
        #is_morning = days == 21
        #is_night = days == 24
        #is_morning = (has_passenger_all == 1) & (days == 21) & is_morning
        #is_night = (has_passenger_all == 1) & (days == 21) & is_night
        #is_morning = (has_passenger_all == 1) & (hours == 6)
        #is_night = (has_passenger_all == 1) & (hours == 18)
        suffix = '2'

    suffix += '-' + str(num_bins)
    #print np.unique(days)

    #is_morning = days == 4
    #is_night = days == 8

    day_locs, day_values = count_cars(x_all[is_morning], y_all[is_morning],
                                      num_bins)
    night_locs, night_values = count_cars(x_all[is_night], y_all[is_night],
                                          num_bins)
    if return_coords:
        day_locs = bin_to_coordinates(day_locs, x_bounds, y_bounds, num_bins)
        night_locs = bin_to_coordinates(night_locs, x_bounds, y_bounds,
                                        num_bins)
    '''
    if use_alternate:
        I = (day_values > 0) | (night_values > 0)
        I = I & (day_values > 0) & (night_values > 0)
    else:
        I = (day_values > 5) | (night_values > 5)
        I = I & (day_values > 0) & (night_values > 0)
    relative_diff = np.max(day_values[I] - night_values[I]) / day_values[I]
    '''
    #array_functions.plot_heatmap(day_locs[I], relative_diff, sizes=10, alpha=1, subtract_min=False)
    return day_locs, day_values, night_locs, night_values, suffix
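
quantize_loc is not shown in this listing; below is a plausible sketch of what such a quantizer does, mapping raw coordinates onto num_bins integer cells. This is an assumption for illustration, not the project's actual implementation.

import numpy as np

def quantize_loc(v, num_bins):
    # hypothetical: map each value to an integer bin in [0, num_bins - 1]
    # over its observed range
    v = np.asarray(v, dtype=float)
    t = (v - v.min()) / (v.max() - v.min())
    return np.minimum((t * num_bins).astype(int), num_bins - 1)

print(quantize_loc(np.array([0.0, 0.5, 1.0]), 4))  # [0 2 3]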
Example 38
def vis_data():
    s = 'data_sets/' + data_file_dir + '/raw_data.pkl'
    data = helper_functions.load_object(s)
    x = data.x
    y = data.y
    titles = ['', '']
    label_idx = [0, 1]
    if plot_climate:
        img_path = 'C:/PythonFramework/far_transfer/figures/climate-terrain.png'
        image = imread(img_path)
        label_idx = [0, 4]
    if data_file_dir == 'climate-month':
        titles = [
            'Max Temperature Gradient: January',
            'Max Temperature Gradient: April'
        ]
        label_idx = [0, 4]
    elif data_file_dir == 'irs-income':
        titles = ['Income', 'Household Size']
    elif data_file_dir == 'zillow-traffic':
        titles = ['Morning Taxi Pickups', 'Housing Prices']
    elif data_file_dir == 'kc-housing-spatial-floors':
        titles = ['House Prices: 1 Floor', 'House Prices: 2 or More Floors']

    if plot_features:

        for i in range(data.p):
            xi = x[:, i]
            title = 'Feature Names Missing'
            if data.feature_names is not None:
                title = data.feature_names[i]
            array_functions.plot_2d(xi,
                                    y,
                                    data_set_ids=data.data_set_ids,
                                    title=title)
    else:
        for i, title in zip(label_idx, titles):
            #plt.close()
            I = data.data_set_ids == i
            if plot_gradients or plot_values:
                g, v = estimate_gradients(x, y, I)
                if plot_values:
                    g = v
                #g = np.log(g)
                #g -= g.min()
                #g += g.max()/10.0
                #g /= g.max()
                if data_file_dir == 'zillow-traffic':
                    # both series use plain min-max normalization
                    g -= g.min()
                    g /= g.max()
                else:
                    g -= g.min()
                    g /= g.max()
                    if i == 0:
                        g = np.sqrt(g)
                #array_functions.plot_heatmap(g, sizes=dot_sizes, fig=fig, title=title)
                fig = plt.figure(i)
                plt.title(title)
                plt.axis('off')
                plt.imshow(g)
                array_functions.move_fig(fig, 750, 400)
                #plt.show(block=False)
            else:
                fig = plt.figure(4)
                array_functions.plot_heatmap(x[I, :],
                                             y[I],
                                             sizes=dot_sizes,
                                             fig=fig,
                                             title=title)
                if plot_climate:
                    plt.imshow(image, zorder=0, extent=[-90, -78, 33.5, 38])
                    array_functions.move_fig(fig, 1400, 600)
        plt.show(block=True)

    pass
Example 39
def run_visualization():
    vis_configs = configs_lib.VisualizationConfigs()
    #data_sets = configs_lib.data_sets_for_exps
    #n = len(data_sets)
    viz_params = configs_lib.viz_params
    n = len(viz_params)

    if getattr(vis_configs, 'figsize', None):
        fig = plt.figure(figsize=vis_configs.figsize)
    else:
        fig = plt.figure()
    #fig.suptitle('Results')
    #num_rows = min(n, configs_lib.max_rows)
    num_rows = min(n, vis_configs.max_rows)
    num_cols = int(math.ceil(float(n) / num_rows))
    markers = [
        's', '*', '>', '^', 'v', 'X', 'P', 'd', '*'
    ]
    for config_idx, curr_viz_params in enumerate(viz_params):
        subplot_idx = config_idx + 1
        plt.subplot(num_rows,num_cols,subplot_idx)
        axis = [0, 1, np.inf, -np.inf]
        vis_configs = configs_lib.VisualizationConfigs(**curr_viz_params)
        sizes = None
        min_x = np.inf
        max_x = -np.inf
        marker_idx = -1
        is_file_missing = False
        for file, legend_str in vis_configs.results_files:
            if not os.path.isfile(file):
                is_file_missing = True
                print file + " doesn't exist - skipping"
                assert len(viz_params) == 1 or \
                       vis_configs.show_legend_on_all or \
                       vis_configs.show_legend_on_missing_files  or \
                       not vis_configs.crash_on_missing_files, \
                    'Just to be safe, crashing because files are missing'
                continue
            marker_idx += 1
            results = helper_functions.load_object(file)
            sized_results = get_sized_results(file)
            sizes_to_plot = vis_configs.sizes_to_use
            if sizes_to_plot is not None:
                sizes_to_plot = set(sizes_to_plot)
            results = combine_results(results, sized_results)
            to_remove = list()
            for j, s in enumerate(results.sizes):
                if sizes_to_plot is not None and s not in sizes_to_plot:
                    to_remove.append(j)
            for j in reversed(to_remove):
                del results.results_list[j]
            #results.results_list = results.results_list[~to_remove]
            if len(results.sizes) == 0:
                print file + ' has no results for sizes ' + str(sizes_to_plot) + ', skipping'

            #plt.plot([1,2,3], [1,2,3], 'go-', label='line 1', linewidth=2)
            processed_results = results.compute_error_processed(
                vis_configs.loss_function,
                vis_configs.results_features,
                vis_configs.instance_subset,
                normalize_output=True
            )
            sizes = results.sizes

            #sizes = sizes[0:4]
            min_x = min(min_x, sizes.min())
            max_x = max(max_x, sizes.max())
            s = legend_str
            if s is None:
                s = results.configs.learner.name_string
            print 'Plotting: ' + file
            print 'Mean Errors: ' + str(processed_results.means)
            plt.errorbar(sizes,
                         processed_results.means,
                         yerr=[processed_results.lows, processed_results.highs],
                         label=s,
                         marker=markers[marker_idx],
                         markersize=8
            )
            highs = np.asarray(processed_results.means) + np.asarray(processed_results.highs)
            lows = np.asarray(processed_results.means) - np.asarray(processed_results.lows)
            axis[3] = max(axis[3], highs.max() +  .2*lows.min())
            axis[2] = min(axis[2], .9*lows.min())
        if sizes is None:
            print 'Empty plot - skipping'
            continue
        plt.title(vis_configs.title, fontsize=vis_configs.fontsize)
        axis_range = max_x - min_x
        axis[1] = max_x + .1*axis_range
        axis[0] = min_x - .1*axis_range
        #show_x_label = num_rows == 1 or subplot_idx > (num_rows-1)*num_cols
        #show_x_label = num_rows == 1 or subplot_idx == 8
        #show_x_label = subplot_idx == 9
        #show_x_label = subplot_idx == 8
        show_x_label = True
        show_y_label = num_cols == 1 or subplot_idx % num_cols == 1 or vis_configs.always_show_y_label

        if show_x_label:
            plt.xlabel(vis_configs.x_axis_string)
        if show_y_label:
            plt.ylabel(vis_configs.y_axis_string)
        #axis[1] *= 2
        axis[3] *= 1
        ylims = getattr(vis_configs,'ylims',None)
        if ylims is not None:
            axis[2] = ylims[0]
            axis[3] = ylims[1]
        plt.axis(axis)
        if config_idx == 2 or vis_configs.show_legend_on_all or len(viz_params) == 1\
                or (vis_configs.show_legend_on_missing_files and is_file_missing):
            plt.legend(loc='upper right', fontsize=vis_configs.fontsize)
    #fig.tight_layout(rect=[.05,.05,.95,.95])
    if getattr(vis_configs,'borders',None):
        left,right,top,bottom = vis_configs.borders
        fig.subplots_adjust(left=left,right=right,top=top,bottom=bottom)
    if vis_configs.use_tight_layout:
        plt.tight_layout()
    plt.show()
    x = 1

Example 40
def vis_data():
    pc = configs_lib.ProjectConfigs(bc.DATA_KC_HOUSING)
    #pc = configs_lib.ProjectConfigs(bc.DATA_CLIMATE_MONTH)
    pc.active_method = configs_lib.ACTIVE_CLUSTER_PURITY
    #pc.active_method = configs_lib.ACTIVE_CLUSTER
    #pc.active_method = configs_lib.ACTIVE_RANDOM
    pc.fixed_sigma_x = False
    pc.no_spectral_kernel = False
    pc.no_f_x = False
    pc.active_items_per_iteration = 10
    use_oracle_target = False

    main_configs = configs_lib.MainConfigs(pc)
    data_file = '../' + main_configs.data_file
    data_and_splits = helper_functions.load_object(data_file)
    data = data_and_splits.get_split(0, 0)
    is_target = data.data_set_ids == main_configs.target_labels[0]
    is_source = data.data_set_ids == main_configs.source_labels[0]
    data.reveal_labels(is_source.nonzero()[0])
    data.type = data_lib.TYPE_TARGET*np.ones(data.n)
    data.type[is_source] = data_lib.TYPE_SOURCE
    x = data.x
    y = data.y


    learner = main_configs.learner
    learner.use_oracle_target = use_oracle_target
    if pc.active_method == configs_lib.ACTIVE_CLUSTER_PURITY and False:
        learner.instance_selector.cv_params['sigma_y'] = [1]
    print 'Experiment: ' + learner.prefix
    results = learner.train_and_test(data)
    queried_data = results.results_list[0].queried_idx
    selected_data = data.get_subset(queried_data)

    fig = plt.figure(0, figsize=(12, 5))
    plt.title('TODO')
    plt.axis('off')

    x1 = data.x[:, 0]
    x1_sel = selected_data.x[:, 0]
    if data.p == 1:
        x2 = data.true_y
        x2_sel = selected_data.true_y
    else:
        assert data.p == 2
        x2 = data.x[:, 1]
        x2_sel = selected_data.x[:, 1]

    plt.subplot(1, 3, 1)
    plt.scatter(x1[is_target], x2[is_target], c='b', s=10)
    plt.scatter(x1_sel, x2_sel, c='r', s=20)


    if data.p == 2:
        plt.subplot(1, 3, 2)

        target_data = data.get_subset(is_target)
        target_data.y = target_data.true_y.copy()

        nw_method = method.NadarayaWatsonMethod()
        y_pred = nw_method.train_and_test(target_data).prediction.y
        means, _, _, _ = binned_statistic_2d(target_data.x[:, 0], target_data.x[:, 1], y_pred, bins=30)
        #means = means[:, ::-1]
        #means = means[::-1, :]
        means[~np.isfinite(means)] = -1
        plt.pcolormesh(means, cmap='RdBu')
        plt.colorbar()

        plt.subplot(1, 3, 3)

        source_data = data.get_subset(is_source)
        source_data.y = source_data.true_y.copy()

        nw_method = method.NadarayaWatsonMethod()
        y_pred = nw_method.train_and_test(source_data).prediction.y
        means, _, _, _ = binned_statistic_2d(source_data.x[:, 0], source_data.x[:, 1], y_pred, bins=30)
        # means = means[:, ::-1]
        # means = means[::-1, :]
        means[~np.isfinite(means)] = -1
        plt.pcolormesh(means, cmap='RdBu')
        plt.colorbar()

    plt.show()