def create_pollution(
    labels_to_use=np.arange(2), series_to_use=0, num_instances=None, normalize_xy=True, save_data=True
):
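    # Build a pollution time-series data set: load the preprocessed series, keep the
    # requested labels, smooth missing values, optionally truncate and normalize,
    # plot the result, and save it (the path encodes series index, size, normalization).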
    file = "pollution/processed_data.pkl"
    y, ids = helper_functions.load_object(file)
    y_to_use = y[:, series_to_use, :]
    print str(series_to_use) + ": " + ids[series_to_use]
    data = data_class.TimeSeriesData(y_to_use, np.asarray([ids[series_to_use]]))
    data.is_regression = True
    data.keep_series(labels_to_use)
    data = data.get_min_range()
    data.smooth_missing()
    data.x = data.x.astype(np.float)
    if num_instances is not None:
        data = data.get_range([0, num_instances])
    if normalize_xy:
        data.reset_x()
        data.normalize_y()

    data = data.create_data_instance()
    # perc_used = data.get_perc_used()
    if num_instances is not None:
        s = "pollution-%d-%d" % (series_to_use, num_instances)
    else:
        s = "pollution-%d" % series_to_use
    if normalize_xy:
        s += "-norm"
    s += "/raw_data.pkl"
    # array_functions.plot_2d_sub_multiple_y(data.x, data.y, title=None, sizes=10)
    array_functions.plot_2d_sub(data.x, data.y, data_set_ids=data.data_set_ids, title=None, sizes=10)
    if save_data:
        helper_functions.save_object(s, data)
def create_synthetic_cross_transfer():
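    # Target and source are lines with opposite slopes (so they cross), generated via
    # create_synthetic_regression_transfer.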
    slope = 5
    target_fun = lambda x: slope * x
    source_fun = lambda x: -slope * x + 5
    data = create_synthetic_regression_transfer(target_fun, source_fun)
    s = synthetic_cross_file
    helper_functions.save_object(s, data)
def create_synthetic_flip_transfer(file_dir="", dim=1):
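    # Step functions with flipped levels across domains: the target (id 0) steps from
    # 1 up to 2 at x >= 0.5, while the source (id 1) steps from 2 down to 1.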
    n_target = 100
    n_source = 100
    n = n_target + n_source
    sigma = 0.2
    data = data_class.Data()
    data.x = np.random.uniform(0, 1, (n, dim))
    data.data_set_ids = np.zeros(n)
    data.data_set_ids[n_target:] = 1
    data.y = np.zeros(n)
    data.y[(data.data_set_ids == 0) & (data.x[:, 0] >= 0.5)] = 2
    data.y[(data.data_set_ids == 1) & (data.x[:, 0] >= 0.5)] = 1
    data.y[(data.data_set_ids == 0) & (data.x[:, 0] <= 0.5)] = 1
    data.y[(data.data_set_ids == 1) & (data.x[:, 0] <= 0.5)] = 2
    data.y += np.random.normal(0, sigma, n)
    data.set_train()
    data.set_true_y()
    data.is_regression = True
    if dim == 1:
        array_functions.plot_2d(data.x, data.y, data.data_set_ids)
    s = synthetic_flip_file
    if dim > 1:
        s = synthetic_step_kd_transfer_file % dim
    if file_dir != "":
        s = file_dir + "/" + s
    helper_functions.save_object(s, data)
def create_synthetic_hypothesis_transfer(n=500, p=50, kt=1, ks=1, sigma=1.0, sigma_s=0.3):
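    # Hypothesis-transfer data: one target linear classification task plus kt "related"
    # and ks unrelated source tasks over the same x; the true weight vectors are stored
    # (one column per task) in metadata['true_w'].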
    wt = np.random.normal(0, sigma, p)
    all_data, w_eff = create_synthetic_linear_classification(n=n, p=p, sigma=sigma, w=wt)
    x = all_data.x
    all_data.data_set_ids = np.zeros(n)
    wt = w_eff
    data_set_counter = 1
    diffs = []
    is_target = array_functions.false(kt + ks)
    is_target[:kt] = True
    all_data.true_w = np.zeros((ks + kt + 1, p))
    all_data.true_w[0, :] = wt
    for i, val in enumerate(is_target):
        data_set_id = data_set_counter
        data_set_counter += 1
        if val:
            ws = wt + np.random.normal(0, sigma_s, p)
            # NOTE: the next line overwrites the perturbation, so "related" source
            # tasks currently use wt exactly (sigma_s has no effect here).
            ws = wt
        else:
            ws = np.random.normal(0, sigma, p)
        source_data, ws = create_synthetic_linear_classification(w=ws, x=x)
        source_data.data_set_ids = data_set_id * np.ones(n)
        # source_data.true_y *= (i+2)
        source_data.y = source_data.true_y
        all_data.combine(source_data)
        diff = norm(wt / norm(wt) - ws / norm(ws))
        diffs.append(diff)
        all_data.true_w[data_set_id, :] = ws
    all_data.true_w = all_data.true_w.T
    all_data.metadata = dict()
    all_data.metadata["true_w"] = all_data.true_w
    s = synthetic_hypothesis_transfer_class_file % (
        str(n) + "-" + str(p) + "-" + str(sigma) + "-" + str(sigma_s) + "-" + str(kt) + "-" + str(ks)
    )
    helper_functions.save_object(s, all_data)
def _run_experiment_args(self, results_file, data_and_splits, method_results, i_labels, split):
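    # Run a single (num_labels, split) experiment. This is a module-level helper rather
    # than a method, presumably so it can be dispatched through the multiprocessing
    # pool; finished splits are cached in per-split temp files and reused.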
    num_labels = self.configs.num_labels[i_labels]
    s = str(num_labels) + '-' + str(split)
    curr_results = _load_temp_split_file(results_file, num_labels, split)
    if curr_results:
        return curr_results
    #print 'num_labels-split: ' + s
    temp_file_name = _temp_split_file_name(results_file, num_labels, split)
    temp_dir_root = helper_functions.remove_suffix(temp_file_name, '.pkl')
    temp_dir = temp_dir_root + '/CV-temp/'
    curr_data = data_and_splits.get_split(split, num_labels)
    learner = self.configs.learner
    curr_learner = copy.deepcopy(learner)
    curr_learner.split_idx_str = s
    curr_learner.temp_dir = temp_dir
    curr_results = curr_learner.train_and_test(curr_data)
    if mpi_utility.is_group_master():
        helper_functions.save_object(_temp_split_file_name(results_file,num_labels,split),curr_results)
        helper_functions.delete_dir_if_exists(temp_dir_root)
    if mpi_utility.is_group_master():
        if hasattr(curr_learner, 'best_params'):
            print s + '-' + str(curr_learner.best_params) + ' Error: ' + str(curr_results.compute_error(self.configs.loss_function))
        else:
            print s + ' Done'
    return curr_results
def split_data(file, configs):
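    # Generate num_splits (30) random train/test splits at perc_train (80%) for a saved
    # data set and write them to split_data.pkl next to the original file.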
    data = helper_functions.load_object(file)
    splitter = DataSplitter()
    splitData = data_lib.SplitData()
    splitData.data = data
    num_splits = 30
    perc_train = .8
    keep_for_splitting = None
    if configs.split_data_set_ids is not None:
        keep_for_splitting = array_functions.false(data.n)
        keep_for_splitting[data.data_set_ids == 0] = True
    #Pretend data_set_ids is a label vector to ensure each data set is split equally
    if data.is_regression and data.data_set_ids is not None:
        assert len(data.data_set_ids) == data.n
        is_regression = False
        splitData.splits = splitter.generate_splits(
            data.data_set_ids,
            num_splits,
            perc_train,
            is_regression,
            keep_for_splitting
        )
    else:
        splitData.splits = splitter.generate_splits(
            data.y,
            num_splits,
            perc_train,
            data.is_regression,
            keep_for_splitting
        )
    splitData.data_set_ids_to_keep = configs.data_set_ids_to_keep
    split_dir = os.path.dirname(file)
    save_file = split_dir + '/split_data.pkl'
    helper_functions.save_object(save_file,splitData)
    return splitData
def create_kc_housing():
    file = "kc_housing/processed_data.pkl"
    x, y = helper_functions.load_object(file)
    data = data_class.Data(x, y)
    data.is_regression = True
    s = kc_housing_file
    helper_functions.save_object(s, data)
def create_synthetic_delta_linear_transfer():
    slope = 5
    target_fun = lambda x: slope * x
    source_fun = lambda x: slope * x + 4
    data = create_synthetic_regression_transfer(target_fun, source_fun)
    array_functions.plot_2d(data.x, data.y, data.data_set_ids, title="Linear Delta Data Set")
    s = synthetic_delta_linear_file
    helper_functions.save_object(s, data)
def create_covtype():
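    # Fetch the sklearn covertype data set and save it as raw_data.pkl.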
    covtype_data = datasets.fetch_covtype()
    print covtype_data.__dict__
    data = data_class.Data()
    data.x = covtype_data.data
    data.y = covtype_data.target
    helper_functions.save_object("data_sets/covtype/raw_data.pkl", data)
def create_wine(data_to_create=WINE_RED):
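    # Build the wine-quality data set from the red/white CSVs. WINE_TRANSFER stacks both
    # and uses red vs. white as data_set_ids; otherwise a single variant is used.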
    red_file = "wine/winequality-red.csv"
    white_file = "wine/winequality-white.csv"
    field_names, red_data = load_csv(red_file, delim=";")
    white_data = load_csv(white_file, delim=";")[1]

    if data_to_create == WINE_TRANSFER:
        red_ids = np.zeros((red_data.shape[0], 1))
        white_ids = np.ones((white_data.shape[0], 1))
        red_data = np.hstack((red_data, red_ids))
        white_data = np.hstack((white_data, white_ids))
        wine_data = np.vstack((red_data, white_data))

        ids = wine_data[:, -1]
        x = wine_data[:, :-2]
        y = wine_data[:, -2]
        used_field_names = field_names[:-1]
        viz = True
        if viz:
            learner = make_learner()
            # learner = None
            viz_features(x, y, ids, used_field_names, alpha=0.01, learner=learner)
        suffix = "transfer"
    else:
        if data_to_create == WINE_RED:
            wine_data = red_data
            suffix = "red"
        elif data_to_create == WINE_WHITE:
            wine_data = white_data
            suffix = "white"
        else:
            assert False

        ids = None
        x = wine_data[:, :-1]
        y = wine_data[:, -1]
        used_field_names = field_names[:-1]
    data = data_class.Data()
    data.x = array_functions.standardize(x)
    if data_to_create == WINE_TRANSFER:
        pass
        # feat_idx = 1
        # data.x = array_functions.vec_to_2d(x[:,feat_idx])

    data.y = y
    data.set_train()
    data.set_target()
    data.set_true_y()
    data.data_set_ids = ids
    data.is_regression = True
    """
    data = data.rand_sample(.25, data.data_set_ids == 0)
    data = data.rand_sample(.1, data.data_set_ids == 1)
    s = wine_file % ('-small-' + str(data.p))
    """
    s = wine_file % ("-" + suffix)
    helper_functions.save_object(s, data)
def create_and_save_data(x, y, domain_ids, file):
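    # Small helper: wrap (x, y, domain_ids) in a regression Data object and save it to file.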
    data = data_class.Data()
    data.x = array_functions.vec_to_2d(x)
    data.y = y
    data.set_train()
    data.set_target()
    data.set_true_y()
    data.is_regression = True
    data.data_set_ids = domain_ids
    helper_functions.save_object(file, data)
def create_time_series(label_to_use=0,
                       series_to_use=0,
                       num_instances=None,
                       normalize_x=False,
                       save_data=True,
                       name='CO2_emissions'):
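    # Build a (possibly multi-series) time-series data set. Note that series_to_use is
    # iterated over, so it should be an iterable of series indices. Each series keeps
    # the requested label, is subsampled (every 7th point) and clipped to a fixed range,
    # then all series are combined and saved.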
    file = name + '/processed_data.pkl'
    all_data = []
    for i in series_to_use:
        y, ids = helper_functions.load_object(file)
        y_to_use = y[:, i, :]
        print str(i) + ': ' + ids[i]
        data = data_class.TimeSeriesData(y_to_use, np.asarray([ids[i]]))
        data.is_regression = True
        data.keep_series(label_to_use)
        data = data.get_min_range()
        data.smooth_missing()
        data = data.get_nth(7)
        data.reset_x()
        data.x = data.x.astype(np.float)
        if num_instances is not None:
            data = data.get_range([0, num_instances])
        data = data.get_range([1000, 1500])
        if normalize_x:
            data.x -= data.x.min()
            data.x /= data.x.max()
        data = data.create_data_instance()
        try:
            if len(series_to_use) > 1:
                data.data_set_ids[:] = i
        except:
            # series_to_use may be a scalar (no len), or data_set_ids may be unset
            pass
        all_data.append(data)
        # perc_used = data.get_perc_used()
    data = all_data[0]
    del all_data[0]
    for di in all_data:
        data.combine(di)
    if num_instances is not None:
        s = name + '-%s-%d' % (str(series_to_use), num_instances)
    else:
        s = name + '-%s' % str(series_to_use)
    if normalize_x:
        s += '-norm'
    s += '/raw_data.pkl'
    # array_functions.plot_2d_sub_multiple_y(data.x, data.y, title=None, sizes=10)
    array_functions.plot_2d_sub(data.x,
                                data.y,
                                data_set_ids=data.data_set_ids,
                                title=None,
                                sizes=10)
    if save_data:
        helper_functions.save_object(s, data)
def create_spatial_data(dir='climate-month'):
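    # Load (location, value) climate data, drop stations with missing coordinates, and
    # save a Data object converted from multi-label to multi-source form.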
    file = dir + '/processed_data.pkl'
    locs, y, ids = helper_functions.load_object(file)
    y = y.T
    is_missing_loc = (~np.isfinite(locs)).any(1)
    locs = locs[~is_missing_loc, :]
    y = y[~is_missing_loc, :]
    ids = ids[~is_missing_loc]
    data = data_class.Data(locs, y)
    data.multilabel_to_multisource()
    s = dir + '/raw_data.pkl'
    helper_functions.save_object(s, data)
    def run_experiments(self):
        data_file = self.configs.data_file
        data_and_splits = helper_functions.load_object(data_file)
        data_and_splits.data.repair_data()
        assert self.configs.num_splits <= len(data_and_splits.splits)
        data_and_splits.labels_to_keep = self.configs.labels_to_keep
        data_and_splits.labels_to_not_sample = self.configs.labels_to_not_sample
        data_and_splits.target_labels = self.configs.target_labels
        data_and_splits.data.repair_data()
        results_file = self.configs.results_file
        comm = mpi_utility.get_comm()
        if os.path.isfile(results_file):
            if mpi_utility.is_group_master():
                print results_file + ' already exists - skipping'
            return            
        if mpi_utility.is_group_master():
            hostname = helper_functions.get_hostname()
            print '(' + hostname  + ') Running experiments: ' + results_file
        learner = self.configs.learner
        learner.run_pre_experiment_setup(data_and_splits)
        num_labels = len(self.configs.num_labels)
        num_splits = self.configs.num_splits
        #method_results = results.MethodResults(n_exp=num_labels, n_splits=num_splits)
        method_results = self.configs.method_results_class(n_exp=num_labels, n_splits=num_splits)
        for i, nl in enumerate(self.configs.num_labels):
            method_results.results_list[i].num_labels = nl

        split_idx = self.configs.split_idx
        if split_idx is not None:
            num_labels_list = list(itertools.product(range(num_labels), [split_idx]))
        else:
            num_labels_list = list(itertools.product(range(num_labels), range(num_splits)))

        shared_args = (self, results_file, data_and_splits, method_results)
        args = [shared_args + (i_labels, split) for i_labels,split in num_labels_list]
        if self.configs.use_pool:
            pool = multiprocessing_utility.LoggingPool(processes=self.configs.pool_size)
            all_results = pool.map(_run_experiment, args)
        else:
            all_results = [_run_experiment(a) for a in args]
        for curr_results,s in zip(all_results,num_labels_list):
            if curr_results is None:
                continue
            i_labels, split = s
            method_results.set(curr_results, i_labels, split)

        method_results.configs = self.configs
        if self.configs.should_load_temp_data:
            helper_functions.save_object(results_file,method_results)
            for i_labels, split in num_labels_list:
                num_labels = self.configs.num_labels[i_labels]
                _delete_temp_split_files(results_file, num_labels, split)
            _delete_temp_folder(results_file)
def create_spatial_data(dir="climate-month"):
    file = dir + "/processed_data.pkl"
    locs, y, ids = helper_functions.load_object(file)
    # y = y.T
    is_missing_loc = (~np.isfinite(locs)).any(1)
    locs = locs[~is_missing_loc, :]
    y = y[~is_missing_loc, :]
    ids = ids[~is_missing_loc]
    data = data_class.Data(locs, y)
    data.multilabel_to_multisource()
    s = dir + "/raw_data.pkl"
    helper_functions.save_object(s, data)
def subset_1_per_instance_id():
    data = helper_functions.load_object('data_sets/' + create_data_set.adience_aligned_cnn_file)
    to_keep = array_functions.false(data.n)
    all_ids = np.unique(data.instance_ids)
    for id in all_ids:
        has_id = (data.instance_ids == id).nonzero()[0]
        to_keep[has_id[0]] = True
        pass
    to_keep = to_keep & data.is_labeled
    data = data.get_subset(to_keep)
    helper_functions.save_object('data_sets/' + create_data_set.adience_aligned_cnn_1_per_instance_id_file,
                                 data)
    pass
def create_concrete(transfer=False):
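    # Build the concrete compressive-strength data set. With transfer=True, samples get
    # domain ids from age buckets (<10, 10-28, >75 days) and the feature set is reduced
    # according to concrete_num_feats.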
    file = 'concrete/Concrete_Data.csv'
    used_field_names, concrete_data = load_csv(file)

    data = data_class.Data()
    t = ''
    if transfer:
        feat_ind = 0
        domain_ind = (used_field_names == 'age').nonzero()[0][0]
        ages = concrete_data[:, domain_ind]
        domain_ids = np.zeros(ages.shape)
        domain_ids[ages < 10] = 1
        domain_ids[(ages >= 10) & (ages <= 28)] = 2
        domain_ids[ages > 75] = 3
        data.x = concrete_data[:, 0:(concrete_data.shape[1] - 2)]
        #0,3,5
        #data.x = preprocessing.scale(data.x)
        if concrete_num_feats == 1:
            data.x = array_functions.vec_to_2d(data.x[:, feat_ind])
            t = '-feat=' + str(feat_ind)
        elif concrete_num_feats >= data.x.shape[1]:
            t = '-' + str(min(data.x.shape[1], concrete_num_feats))
        else:
            assert False
        data.data_set_ids = domain_ids
    else:
        data.x = concrete_data[:, 0:-1]

    data.y = concrete_data[:, -1]
    data.set_train()
    data.set_target()
    data.set_true_y()
    data.is_regression = True

    viz = False
    if viz:
        to_use = domain_ids > 0
        domain_ids = domain_ids[to_use]
        concrete_data = concrete_data[to_use, :]
        concrete_data = np.delete(concrete_data, domain_ind, 1)
        viz_features(concrete_data, concrete_data[:, -1], domain_ids,
                     used_field_names)

        return
    data.x = array_functions.standardize(data.x)
    #viz_features(data.x,data.y,data.data_set_ids)

    s = concrete_file % t
    helper_functions.save_object(s, data)
def create_synthetic_multitask_transfer():
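    # One steep source task (slope 8) and two target tasks with similar slopes but
    # different offsets.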
    slope_source = 8
    target_slope1 = 4
    target_slope2 = 4.5
    source_func = lambda x: slope_source * x
    target_funcs = [
        lambda x: target_slope1 * x + 3, lambda x: target_slope2 * x + 8
    ]
    data = create_synthetic_regression_transfer(target_funcs, source_func)
    array_functions.plot_2d(data.x,
                            data.y,
                            data.data_set_ids,
                            title='Multitask Slant')
    s = synthetic_slant_multitask
    helper_functions.save_object(s, data)
def create_drosophila():
    data = helper_functions.load_object("drosophilia/processed_data.pkl")
    x, y = data
    y = np.reshape(y, y.shape[0])
    I = np.random.choice(x.shape[0], size=500, replace=False)
    x = x[I, :]
    y = y[I]
    data = data_class.Data()
    data.x = x
    data.y = y
    data.set_train()
    data.set_target()
    data.set_true_y()
    data.is_regression = True

    helper_functions.save_object(drosophila_file, data)
    def run_experiments(self):
        data_file = self.configs.data_file
        data_and_splits = self.load_data_and_splits(data_file)
        results_file = self.configs.results_file
        comm = mpi_utility.get_comm()
        if os.path.isfile(results_file):
            if mpi_utility.is_group_master():
                print results_file + ' already exists - skipping'
            return            
        if mpi_utility.is_group_master():
            hostname = helper_functions.get_hostname()
            print '(' + hostname  + ') Running experiments: ' + results_file
        learner = self.configs.learner
        learner.run_pre_experiment_setup(data_and_splits)
        num_labels = len(self.configs.num_labels)
        num_splits = self.configs.num_splits
        #method_results = results.MethodResults(n_exp=num_labels, n_splits=num_splits)
        method_results = self.configs.method_results_class(n_exp=num_labels, n_splits=num_splits)
        for i, nl in enumerate(self.configs.num_labels):
            method_results.results_list[i].num_labels = nl

        split_idx = self.configs.split_idx
        if split_idx is not None:
            num_labels_list = list(itertools.product(range(num_labels), [split_idx]))
        else:
            num_labels_list = list(itertools.product(range(num_labels), range(num_splits)))

        shared_args = (self, results_file, data_and_splits, method_results)
        args = [shared_args + (i_labels, split) for i_labels,split in num_labels_list]
        if self.configs.use_pool:
            pool = multiprocessing_utility.LoggingPool(processes=self.configs.pool_size)
            all_results = pool.map(_run_experiment, args)
        else:
            all_results = [_run_experiment(a) for a in args]
        for curr_results,s in zip(all_results,num_labels_list):
            if curr_results is None:
                continue
            i_labels, split = s
            method_results.set(curr_results, i_labels, split)

        method_results.configs = self.configs
        if self.configs.should_load_temp_data:
            helper_functions.save_object(results_file,method_results)
            for i_labels, split in num_labels_list:
                num_labels = self.configs.num_labels[i_labels]
                _delete_temp_split_files(results_file, num_labels, split)
            _delete_temp_folder(results_file)
def create_synthetic_classification(file_dir='', local=True):
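    # Four-interval classification task: source labels (3, 4) mirror target labels (1, 2),
    # and with local=True the source pattern is flipped on the upper half of [0, 1].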
    dim = 1
    n_target = 200
    n_source = 200
    n = n_target + n_source
    data = data_class.Data()
    data.x = np.random.uniform(0, 1, (n, dim))
    data.data_set_ids = np.zeros(n)
    data.data_set_ids[n_target:] = 1
    data.y = np.zeros(n)
    x, ids = data.x, data.data_set_ids
    I = array_functions.in_range(x, 0, .25)
    I2 = array_functions.in_range(x, .25, .5)
    I3 = array_functions.in_range(x, .5, .75)
    I4 = array_functions.in_range(x, .75, 1)
    id0 = ids == 0
    id1 = ids == 1
    data.y[I & id0] = 1
    data.y[I2 & id0] = 2
    data.y[I3 & id0] = 1
    data.y[I4 & id0] = 2

    data.y[I & id1] = 3
    data.y[I2 & id1] = 4
    data.y[I3 & id1] = 3
    data.y[I4 & id1] = 4
    if local:
        data.y[I3 & id1] = 4
        data.y[I4 & id1] = 3
    data.set_true_y()
    data.set_train()
    data.is_regression = False
    noise_rate = 0
    #data.add_noise(noise_rate)
    data.add_noise(noise_rate, id0, np.asarray([1, 2]))
    data.add_noise(noise_rate, id1, np.asarray([3, 4]))
    s = synthetic_classification_file
    if local:
        s = synthetic_classification_local_file
    i = id1
    array_functions.plot_2d(data.x[i, :], data.y[i])
    if file_dir != '':
        s = file_dir + '/' + s
    helper_functions.save_object(s, data)
def create_synthetic_linear_regression(n=500, p=50, sigma=1, num_non_zero=None):
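    # Linear regression with Gaussian noise; only the first num_non_zero weights are
    # non-zero when that argument is given. True weights go in metadata['true_w'].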
    data = data_class.Data()
    data.x = np.random.uniform(0, sigma, (n, p))
    w = np.random.normal(0, 1, p)
    # w = np.ones(p)
    if num_non_zero is not None:
        w[num_non_zero:] = 0
    data.y = data.x.dot(w)
    data.y += np.random.normal(0, sigma, n)
    data.is_regression = True
    data.set_true_y()
    data.set_train()
    suffix = str(n) + "-" + str(p) + "-" + str(sigma)
    data.metadata = dict()
    data.metadata["true_w"] = w.T
    if num_non_zero is not None:
        suffix += "-nnz=" + str(num_non_zero)
    s = synthetic_linear_reg_file % suffix
    helper_functions.save_object(s, data)
def create_synthetic_step_linear_transfer(file_dir=""):
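    # Linear target function (slope 5); the source domain (id 1) gets an extra +4 step
    # for x >= 0.5.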
    n_target = 100
    n_source = 100
    n = n_target + n_source
    sigma = 0.5
    data = data_class.Data()
    data.x = np.random.uniform(0, 1, (n, 1))
    data.data_set_ids = np.zeros(n)
    data.data_set_ids[n_target:] = 1
    data.y = np.reshape(data.x * 5, data.x.shape[0])
    data.y[(data.data_set_ids == 1) & (data.x[:, 0] >= 0.5)] += 4
    data.y += np.random.normal(0, sigma, n)
    data.set_defaults()
    data.is_regression = True
    array_functions.plot_2d(data.x, data.y, data.data_set_ids, title="Linear Step Data Set")
    s = synthetic_step_linear_transfer_file
    if file_dir != "":
        s = file_dir + "/" + s
    helper_functions.save_object(s, data)
def create_drought(label_to_use=0,
                   series_to_use=0,
                   num_instances=None,
                   normalize_x=False,
                   save_data=True):
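    # Same pipeline as create_pollution, but for a single drought series.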
    file = 'drought/processed_data.pkl'
    y, ids = helper_functions.load_object(file)
    y_to_use = y[:, series_to_use, :]
    print str(series_to_use) + ': ' + ids[series_to_use]
    data = data_class.TimeSeriesData(y_to_use,
                                     np.asarray([ids[series_to_use]]))
    data.is_regression = True
    data.keep_series(label_to_use)
    data = data.get_min_range()
    data.smooth_missing()
    data.x = data.x.astype(np.float)
    if num_instances is not None:
        data = data.get_range([0, num_instances])
    if normalize_x:
        data.x -= data.x.min()
        data.x /= data.x.max()
    data = data.create_data_instance()
    # perc_used = data.get_perc_used()
    if num_instances is not None:
        s = 'drought-%d-%d' % (series_to_use, num_instances)
    else:
        s = 'drought-%d' % series_to_use
    if normalize_x:
        s += '-norm'
    s += '/raw_data.pkl'
    # array_functions.plot_2d_sub_multiple_y(data.x, data.y, title=None, sizes=10)
    array_functions.plot_2d_sub(data.x,
                                data.y,
                                data_set_ids=data.data_set_ids,
                                title=None,
                                sizes=10)
    if save_data:
        helper_functions.save_object(s, data)
def create_synthetic_step_transfer(file_dir='', dim=1):
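    # Target domain (id 0) has a step of height 2 at x >= 0.5; the source domain is
    # flat (noise only).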
    n_target = 100
    n_source = 100
    n = n_target + n_source
    sigma = .5
    data = data_class.Data()
    data.x = np.random.uniform(0, 1, (n, dim))
    data.data_set_ids = np.zeros(n)
    data.data_set_ids[n_target:] = 1
    data.y = np.zeros(n)
    data.y[(data.data_set_ids == 0) & (data.x[:, 0] >= .5)] = 2
    data.y += np.random.normal(0, sigma, n)
    data.set_defaults()
    data.is_regression = True
    if dim == 1:
        array_functions.plot_2d(data.x, data.y, data.data_set_ids)
    s = synthetic_step_transfer_file
    if dim > 1:
        s = synthetic_step_kd_transfer_file % dim
    if file_dir != '':
        s = file_dir + '/' + s
    helper_functions.save_object(s, data)
def create_boston_housing(file_dir=""):
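    # Build the Boston housing data set; if create_transfer_data (expected at module
    # scope) is set, samples are binned into domains by feature 12 and feature 5 can be
    # used as the single input feature.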
    boston_data = datasets.load_boston()
    data = data_class.Data()
    data.x = boston_data.data
    data.y = boston_data.target
    data.feature_names = list(boston_data.feature_names)

    data.set_train()
    data.set_target()
    data.set_true_y()
    data.is_regression = True
    s = boston_housing_raw_data_file
    x = data.x
    y = data.y
    if create_transfer_data:
        x_ind = 5
        domain_ind = 12
        domain_ids = array_functions.bin_data(x[:, domain_ind], num_bins=4)
        x = np.delete(x, domain_ind, 1)
        # viz_features(x,y,domain_ids,boston_data.feature_names)
        data.data_set_ids = domain_ids

        if boston_num_feats == 1:
            data.x = data.x[:, x_ind]
            data.x = array_functions.vec_to_2d(data.x)
            s = s % ""
        elif boston_num_feats >= data.x.shape[1]:
            data.x = array_functions.standardize(data.x)
            p = min(boston_num_feats, data.x.shape[1])
            s = s % ("-" + str(p))
        else:
            assert False
    else:
        s %= ""
    if file_dir != "":
        s = file_dir + "/" + s
    helper_functions.save_object(s, data)
def create_bike_sharing():
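    # Build the bike-sharing data set from day.csv, using the 'yr' column as domain ids
    # and a single weather feature (field_to_use within temp/atemp/hum/windspeed) as x.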
    file = 'bike_sharing/day.csv'
    columns = [0] + range(2, 16)
    all_field_names = pd.read_csv(file, nrows=1, dtype='string')
    all_field_names = np.asarray(all_field_names.keys())
    used_field_names = all_field_names[columns]
    bike_data = np.loadtxt(file, skiprows=1, delimiter=',', usecols=columns)
    domain_ind = used_field_names == 'yr'
    domain_ids = np.squeeze(bike_data[:, domain_ind])
    #inds_to_keep = (used_field_names == 'temp') | (used_field_names == 'atemp')
    #bike_data = bike_data[:,inds_to_keep]
    #used_field_names = used_field_names[inds_to_keep]

    viz = True
    to_use = np.asarray([8, 9, 10, 11])
    x = bike_data[:, to_use]
    used_field_names = used_field_names[to_use]
    y = bike_data[:, -1]
    if viz:
        #learner = make_learner()
        learner = None
        viz_features(x, y, domain_ids, used_field_names, learner=learner)
    field_to_use = 1
    x = x[:, field_to_use]

    data = data_class.Data()
    data.is_regression = True
    data.x = array_functions.vec_to_2d(x)
    data.x = array_functions.standardize(data.x)
    data.y = y
    data.y = array_functions.normalize(data.y)
    data.set_defaults()
    data.data_set_ids = domain_ids

    s = bike_file % ('-feat=' + str(field_to_use))
    helper_functions.save_object(s, data)

    pass
    pl.title('Values 2')
    array_functions.move_fig(fig1, 500, 500, 2000, 100)
    array_functions.move_fig(fig2, 500, 500, 2600, 100)
    pl.show(block=True)

    data = (x, y)
    x = np.vstack((x[I1, :], x[I2, :]))
    data_set_ids = np.hstack((np.zeros(I1.sum()), np.ones(I2.sum())))

    y = np.hstack((y[I1], y[I2]))

    data = data_lib.Data(x, y)
    data.x[:, 0] = array_functions.normalize(data.x[:, 0])
    data.x[:, 1] = array_functions.normalize(data.x[:, 1])
    data.data_set_ids = data_set_ids
    print 'n-all: ' + str(data.y.size)
    if save_data:
        s = '../kc-housing-spatial'
        if suffix != '':
            s += '-' + suffix
        helper_functions.save_object(s + '/raw_data.pkl', data)

else:
    feats_to_clear = ['id', 'date', 'yr_renovated', 'zipcode', 'lat', 'long']
    clear_idx = array_functions.find_set(feat_names, feats_to_clear + [y_name])
    x = data[:, ~clear_idx]
    x = array_functions.remove_quotes(x)
    x = x.astype(np.float)
    data = (x, y)
    helper_functions.save_object('processed_data.pkl', data)
    x = np.vstack((day_locs, night_locs))
    '''
    for i in range(x.shape[1]):
        x[:,i] = x[:,i] / x[:,i].max()
    '''
    data_set_ids = np.hstack(
        (np.zeros(day_values.size), np.ones(night_values.size)))

    y = np.hstack((day_values, night_values))
    '''
    if use_alternate:
        I = np.isfinite(y) & (y > 0)
    else:
        I = np.isfinite(y) & (y > 0) & (y > np.log(5))
    '''
    #I[~np.isfinite(y)] = 0
    I = np.isfinite(y)
    I &= array_functions.in_range(y, min_value, max_value)
    if just_center_data:
        I = I & in_range(x[:, 0], .2, .8) & in_range(x[:, 1], .2, .8)

    data = data_lib.Data(x[I, :], y[I])
    data.data_set_ids = data_set_ids[I]
    print 'n: ' + str(data.n)
    print 'n0: ' + str((data.data_set_ids == 0).sum())
    print 'n1: ' + str((data.data_set_ids == 1).sum())
    if save_data:
        file_path = '../taxi%s/raw_data.pkl' % suffix
        helper_functions.save_object(file_path, data)
    print ''
def load_taxi_data(num_files_to_load=np.inf,
                   num_bins=50,
                   use_alternate=True,
                   return_coords=False):
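    # Load the raw taxi GPS traces (caching a combined pickle), keep points inside a
    # fixed bounding box (optionally pickups only), quantize locations onto a num_bins
    # grid, and return per-cell counts for a morning and an evening time window.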
    all_files = [
        f for f in os.listdir(data_dir) if path.isfile(path.join(data_dir, f))
    ]
    x = []
    y = []
    time = []
    has_passenger = []
    #combined_data_file = 'combined_data.pkl'
    combined_data_file = 'C:/PythonFramework/data_sets/taxi/combined_data.pkl'
    if path.exists(combined_data_file):
        print 'loading combined data...'
        all_data = helper_functions.load_object(combined_data_file)
        print 'done loading data'
    else:
        for i, file in enumerate(all_files):
            if i == num_files_to_load:
                break
            if i >= 535:
                break
            file_data = load_csv(path.join(data_dir, file),
                                 has_field_names=False,
                                 delim=str(' '))[1]
            y.append(file_data[:, 0])
            x.append(file_data[:, 1])
            has_passenger.append(file_data[:, 2])
            time.append(file_data[:, 3])
            print i
        all_data = {
            'x': x,
            'y': y,
            'has_passenger': has_passenger,
            'time': time
        }
        print 'saving combined data...'
        helper_functions.save_object(combined_data_file, all_data)
    x = all_data['x']
    y = all_data['y']
    has_passenger = all_data['has_passenger']
    time = all_data['time']
    x_all = np.concatenate(x)
    y_all = np.concatenate(y)
    time_all = np.concatenate(time)

    has_passenger_all = np.concatenate(has_passenger)

    pickup_inds = get_pickup_inds(x_all, y_all, time_all, has_passenger_all)
    if just_pickup:
        x_all = x_all[pickup_inds]
        y_all = y_all[pickup_inds]
        has_passenger_all = has_passenger_all[pickup_inds]
        time_all = time_all[pickup_inds]
    #x_bounds = [-122.45677419354838, -122.38322580645161]
    #y_bounds = [37.738054968287521, 37.816543340380548]

    x_bounds = [-122.48, -122.35]
    y_bounds = [37.7, 37.84]

    #x_bounds = [-np.inf, np.inf]
    #y_bounds = x_bounds
    is_in_range = in_range(x_all, *x_bounds) & in_range(y_all, *y_bounds)
    x_all = x_all[is_in_range]
    y_all = y_all[is_in_range]
    x_all = quantize_loc(x_all, num_bins)
    y_all = quantize_loc(y_all, num_bins)
    time_all = time_all[is_in_range]

    get_hour_vec = np.vectorize(get_hour)
    hours = get_hour_vec(time_all)
    '''
    get_day_vec = np.vectorize(get_day)
    days = get_day_vec(time_all)
    '''
    has_passenger_all = has_passenger_all[is_in_range]

    suffix = '3'
    is_morning = (hours == 9)
    is_night = (hours == 18)
    #is_morning = (hours == 6) & (days == 21)
    #is_night = (hours == 18) & (days == 21)
    #is_morning = (days == 21)
    #is_night = (days == 24)
    if use_alternate:
        is_morning = (hours >= 5) & (hours <= 12)
        is_night = (hours >= 17)
        #is_morning = days == 21
        #is_night = days == 24
        #is_morning = (has_passenger_all == 1) & (days == 21) & is_morning
        #is_night = (has_passenger_all == 1) & (days == 21) & is_night
        #is_morning = (has_passenger_all == 1) & (hours == 6)
        #is_night = (has_passenger_all == 1) & (hours == 18)
        suffix = '2'

    suffix += '-' + str(num_bins)
    #print np.unique(days)

    #is_morning = days == 4
    #is_night = days == 8

    day_locs, day_values = count_cars(x_all[is_morning], y_all[is_morning],
                                      num_bins)
    night_locs, night_values = count_cars(x_all[is_night], y_all[is_night],
                                          num_bins)
    if return_coords:
        day_locs = bin_to_coordinates(day_locs, x_bounds, y_bounds, num_bins)
        night_locs = bin_to_coordinates(night_locs, x_bounds, y_bounds,
                                        num_bins)
    '''
    if use_alternate:
        I = (day_values > 0) | (night_values > 0)
        I = I & (day_values > 0) & (night_values > 0)
    else:
        I = (day_values > 5) | (night_values > 5)
        I = I & (day_values > 0) & (night_values > 0)
    relative_diff = np.max(day_values[I] - night_values[I]) / day_values[I]
    '''
    #array_functions.plot_heatmap(day_locs[I], relative_diff, sizes=10, alpha=1, subtract_min=False)
    return day_locs, day_values, night_locs, night_values, suffix
            for i, s in enumerate(unique_series_ids[is_in_state]):
                print str(i) + ': ' + s
            array_functions.plot_2d_sub_multiple_y(np.asarray(x_val),
                                                   y_val,
                                                   title=None,
                                                   sizes=10)
    else:
        for i in range(times_series_vals.shape[1]):
            y_val = times_series_vals[:, i, :]
            x_val = np.arange(y_val.shape[0])
            if not np.isfinite(y_val).sum(0).all():
                print 'skipping - missing labels'
                continue
            print unique_series_ids[i]
            array_functions.plot_2d_sub_multiple_y(np.asarray(x_val),
                                                   y_val,
                                                   title=unique_series_ids[i],
                                                   sizes=10)

data = (unique_locs, times_series_vals[:, :, y_to_use], unique_series_ids)
suffix = str(y_names[y_to_use])
if use_monthly:
    suffix += '-month'
s = '../climate'
if use_monthly:
    s += '-month'
s += '/processed_data-' + suffix + '.pkl'
helper_functions.save_object(s, data)

pass
def create_synthetic_slant_transfer():
    target_fun = lambda x: 2 * x
    source_fun = lambda x: 2.5 * x + 1
    data = create_synthetic_regression_transfer(target_fun, source_fun)
    s = synthetic_slant_file
    helper_functions.save_object(s, data)
def create_synthetic_curve_transfer():
    target_fun = lambda x: x**2
    source_fun = lambda x: x**2.5 + 1
    data = create_synthetic_regression_transfer(target_fun, source_fun)
    s = synthetic_curve_file
    helper_functions.save_object(s, data)
    '''
    for j in I:
        print date_strs[j]
    '''
    print 'num_items: ' + str(I.size)
    print 'start: ' + date_strs[I[0]]
    print 'end: ' + date_strs[I[-1]]


times_series_vals[times_series_vals < 0] = np.nan
'''
#for state in unique_states:
for state in unique_series_ids:
    is_in_state = np.asarray([s.find(state) == 0 for s in unique_series_ids])
    is_in_state = is_in_state.nonzero()[0]
    if is_in_state.size > 8:
        is_in_state = is_in_state[:8]
    #y_val = times_series_vals[is_in_state, :800, 1].T
    y_val = times_series_vals[is_in_state[0], :2000, :4]
    x_val = range(y_val.shape[0])
    #print unique_series_ids[to_use]
    for i, s in enumerate(unique_series_ids[is_in_state]):
        print str(i) + ': ' + s
    array_functions.plot_2d_sub_multiple_y(np.asarray(x_val), y_val, title=None, sizes=10)
'''

data = (times_series_vals,unique_series_ids)
helper_functions.save_object('processed_data.pkl', data)

pass
def create_20ng_data(file_dir=''):
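    # Build a 20 newsgroups subset: keep the classes listed in l, TF-IDF vectorize with
    # document-frequency cutoffs, and save the dense matrix along with the vocabulary.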
    newsgroups_train = datasets.fetch_20newsgroups(subset='train',
                                                   remove=('headers',
                                                           'footers',
                                                           'quotes'))
    data = data_class.Data()
    short_names = [
        #0
        'A',
        #1-5
        'C1',
        'C2',
        'C3',
        'C4',
        'C5',
        #6
        'M',
        #7-10
        'R1',
        'R2',
        'R3',
        'R4',
        #11-14
        'S1',
        'S2',
        'S3',
        'S4',
        #15
        'O',
        #16-19
        'T1',
        'T2',
        'T3',
        'T4'
    ]
    y = newsgroups_train.target
    #l = [1,2,7,8,12,17]
    #l = [1,2,7,8,12,13]
    #l = [0,1,2,3,4,5,7,8,9,10,11,12,13,14,16,17,18,19]
    l = [0, 1, 2, 7, 8, 11, 12, 16, 17]
    #l = [0, 1, 2, 3, 4, 7, 8, 9, 10,11,12,13,14,16,17,18,19]
    data.label_names = [short_names[i] for i in l]
    I = array_functions.false(len(newsgroups_train.target))
    for i in l:
        I = I | (y == i)
    #I = y == 1 | y == 2 | y == 7 | y == 7 | y == 11 | y == 16
    I = I.nonzero()[0]
    max_df = .5
    min_df = .01
    #max_df = .95
    #min_df = .001
    #max_df = .1
    #min_df = .01
    newsgroups_train.data = [newsgroups_train.data[i] for i in I]
    newsgroups_train.target = newsgroups_train.target[I]
    tf_idf = TfidfVectorizer(stop_words='english',
                             max_df=max_df,
                             min_df=min_df,
                             max_features=max_features)
    vectors = tf_idf.fit_transform(newsgroups_train.data)
    feature_counts = (vectors > 0).sum(0)
    vocab = helper_functions.invert_dict(tf_idf.vocabulary_)
    num_feats = len(vocab)
    vocab = [vocab[i] for i in range(num_feats)]

    #pca = PCA(n_components=pca_feats)
    #v2 = pca.fit_transform(vectors.toarray())
    v2 = vectors.toarray()
    vectors = v2

    y = newsgroups_train.target.copy()
    '''
    y[y==7] = 1
    y[(y==2) | (y==8)] = 2
    y[(y==12) | (y==17)] = 3
    '''
    '''
    y[y == 2] = 1
    y[(y==7) | (y==8)] = 2
    y[(y==12) | (y==13)] = 3
    #I_f = (y==1) | (y==7) | (y==11) | (y==16)
    I_f = array_functions.true(vectors.shape[0])
    f = f_classif
    k_best = SelectKBest(score_func=f, k=pca_feats)
    v2 = k_best.fit_transform(vectors[I_f,:], y[I_f])
    k_best.transform(vectors)
    s = k_best.get_support()
    selected_vocab = [vocab[i] for i in s.nonzero()[0]]
    vocab = selected_vocab
    vectors = v2
    '''

    data.x = vectors
    data.y = newsgroups_train.target
    data.set_train()
    data.set_target()
    data.set_true_y()
    data.is_regression = False
    data.feature_names = vocab
    class_counts = array_functions.histogram_unique(data.y)
    s = ng_raw_data_file
    if file_dir != '':
        s = file_dir + '/' + s
    helper_functions.save_object(s, data)
def create_20ng_data(file_dir=""):
    newsgroups_train = datasets.fetch_20newsgroups(subset="train", remove=("headers", "footers", "quotes"))
    data = data_class.Data()
    short_names = [
        # 0
        "A",
        # 1-5
        "C1",
        "C2",
        "C3",
        "C4",
        "C5",
        # 6
        "M",
        # 7-10
        "R1",
        "R2",
        "R3",
        "R4",
        # 11-14
        "S1",
        "S2",
        "S3",
        "S4",
        # 15
        "O",
        # 16-19
        "T1",
        "T2",
        "T3",
        "T4",
    ]
    data.label_names = short_names
    y = newsgroups_train.target
    l = [1, 2, 7, 8, 12, 17]
    # l = [1,2,7,8,12,13]
    I = array_functions.false(len(newsgroups_train.target))
    for i in l:
        I = I | (y == i)
    # I = y == 1 | y == 2 | y == 7 | y == 7 | y == 11 | y == 16
    I = I.nonzero()[0]
    max_df = 0.95
    min_df = 0.001
    # max_df = .1
    # min_df = .01
    newsgroups_train.data = [newsgroups_train.data[i] for i in I]
    newsgroups_train.target = newsgroups_train.target[I]
    tf_idf = TfidfVectorizer(stop_words="english", max_df=max_df, min_df=min_df, max_features=max_features)
    vectors = tf_idf.fit_transform(newsgroups_train.data)
    feature_counts = (vectors > 0).sum(0)
    vocab = helper_functions.invert_dict(tf_idf.vocabulary_)
    num_feats = len(vocab)
    vocab = [vocab[i] for i in range(num_feats)]

    pca = PCA(n_components=pca_feats)
    v2 = pca.fit_transform(vectors.toarray())
    vectors = v2

    y = newsgroups_train.target.copy()
    """
    y[y==7] = 1
    y[(y==2) | (y==8)] = 2
    y[(y==12) | (y==17)] = 3
    """
    """
    y[y == 2] = 1
    y[(y==7) | (y==8)] = 2
    y[(y==12) | (y==13)] = 3
    #I_f = (y==1) | (y==7) | (y==11) | (y==16)
    I_f = array_functions.true(vectors.shape[0])
    f = f_classif
    k_best = SelectKBest(score_func=f, k=pca_feats)
    v2 = k_best.fit_transform(vectors[I_f,:], y[I_f])
    k_best.transform(vectors)
    s = k_best.get_support()
    selected_vocab = [vocab[i] for i in s.nonzero()[0]]
    vocab = selected_vocab
    vectors = v2
    """

    data.x = vectors
    data.y = newsgroups_train.target
    data.set_defaults()
    data.is_regression = False
    data.feature_names = vocab
    class_counts = array_functions.histogram_unique(data.y)
    s = ng_raw_data_file
    if file_dir != "":
        s = file_dir + "/" + s
    helper_functions.save_object(s, data)
def run_main():
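    # Extract fc7 features for the aligned Adience face images using the pretrained age
    # model, then save the features, extracted ages, and face ids as a Data object.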
    import caffe
    adience_caffe_model_dir = 'C:\\Users\\Aubrey\\Desktop\\cnn_age_gender_models_and_data.0.0.2\\'

    age_net_pretrained = '/age_net.caffemodel'
    age_net_model_file = '/deploy_age.prototxt'

    age_net = caffe.Classifier(adience_caffe_model_dir + age_net_model_file,
                               adience_caffe_model_dir + age_net_pretrained,
                               channel_swap=(2, 1, 0),
                               raw_scale=255,
                               image_dims=(256, 256))

    age_list = [
        '(0, 2)', '(4, 6)', '(8, 12)', '(15, 20)', '(25, 32)', '(38, 43)',
        '(48, 53)', '(60, 100)'
    ]

    adience_image_dir = 'C:\\Users\\Aubrey\\Desktop\\adience_aligned\\aligned\\'
    adience_metadata_file = 'C:\\Users\\Aubrey\\Desktop\\adience_aligned\\alined_metadata\\all_photos.csv'

    metadata = create_data_set.load_csv(
        adience_metadata_file,
        dtype='string',
        delim='\t',
    )

    column_names = metadata[0].tolist()
    photo_data = metadata[1]
    face_id_col = column_names.index('face_id')
    user_id_col = column_names.index('user_id')
    image_name_col = column_names.index('original_image')
    age_col = column_names.index('age')
    x = np.zeros((photo_data.shape[0], 512))
    y = np.zeros((photo_data.shape[0]))
    id = np.zeros((photo_data.shape[0]))
    i = 0
    last_perc_done = 0
    for idx, row in enumerate(photo_data):
        perc_done = math.floor(100 * float(idx) / len(photo_data))
        if perc_done > last_perc_done:
            last_perc_done = perc_done
            print str(perc_done) + '% done'
        image_dir = adience_image_dir + row[user_id_col] + '/'
        face_id = row[face_id_col]
        '''
        images_in_dir = os.listdir(image_dir)
        matching_images = [s for s in images_in_dir if s.find(row[image_name_col]) >= 0]
        assert len(matching_images) < 2
        if len(matching_images) == 0:
            print 'Skipping: ' + image
            continue
        '''
        image = image_dir + 'landmark_aligned_face.' + str(
            face_id) + '.' + row[image_name_col]
        if not os.path.isfile(image):
            print 'Skipping: ' + image
            continue
        input_image = caffe.io.load_image(image)
        age = row[age_col]
        blobs = ['fc7']
        features_age = predict_blobs(age_net, [input_image], blobs)
        x[i, :] = features_age
        y[i] = extract_age(age)
        id[i] = float(face_id)
        i += 1
    data = data_class.Data()
    data.x = x
    data.instance_ids = id
    data.y = y
    data.is_regression = True
    data.set_train()
    data.set_target()
    data.set_true_y()
    data_file = create_data_set.adience_aligned_cnn_file
    helper_functions.save_object('data_sets/' + data_file, data)
    print 'TODO'
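# predict_blobs is assumed to be defined elsewhere in this project; a minimal sketch of
# what it might do (hypothetical helper, not part of pycaffe): run the classifier forward
# and return the named intermediate blobs as a flat feature vector.
#   def predict_blobs(net, images, blob_names):
#       net.predict(images, oversample=False)  # forward pass through caffe.Classifier
#       return np.hstack([net.blobs[b].data[0].flatten() for b in blob_names])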
def create_synthetic_slant_transfer():
    target_fun = lambda x: 2 * x
    source_fun = lambda x: 2.5 * x + 1
    data = create_synthetic_regression_transfer(target_fun, source_fun)
    s = synthetic_slant_file
    helper_functions.save_object(s, data)
def create_boston_housing(file_dir=''):
    boston_data = datasets.load_boston()
    data = data_class.Data()
    data.x = boston_data.data
    data.y = boston_data.target
    data.feature_names = list(boston_data.feature_names)

    data.set_train()
    data.set_target()
    data.set_true_y()
    data.is_regression = True
    s = boston_housing_raw_data_file
    x = data.x
    y = data.y
    create_transfer_data = False
    create_y_split = True
    if create_y_split:
        from base import transfer_project_configs as configs_lib
        pc = configs_lib.ProjectConfigs()
        main_configs = configs_lib.MainConfigs(pc)
        learner = main_configs.learner
        learner.quiet = True
        learner.target_learner[0].quiet = True
        learner.source_learner.quiet = True
        learner.g_learner.quiet = False
        domain_ids = array_functions.bin_data(data.y, num_bins=2)
        data.data_set_ids = domain_ids
        data.is_train[:] = True
        corrs = []
        for i in range(x.shape[1]):
            corrs.append(scipy.stats.pearsonr(x[:, i], y)[0])
        learner.train_and_test(data)
        print 'Just playing with data - not meant to save it'
        for i, name in enumerate(data.feature_names):
            v = learner.g_learner.g[i]
            if abs(v) < 1e-6:
                v = 0
            print name + ': ' + str(v)
        exit()
    elif create_transfer_data:
        x_ind = 5
        domain_ind = 12
        domain_ids = np.ones(x.shape[0])
        domain_ids = array_functions.bin_data(x[:, domain_ind], num_bins=4)
        x = np.delete(x, domain_ind, 1)
        #viz_features(x,y,domain_ids,boston_data.feature_names)
        data.data_set_ids = domain_ids

        if boston_num_feats == 1:
            data.x = data.x[:, x_ind]
            data.x = array_functions.vec_to_2d(data.x)
            s = s % ''
        elif boston_num_feats >= data.x.shape[1]:
            data.x = array_functions.standardize(data.x)
            p = min(boston_num_feats, data.x.shape[1])
            s = s % ('-' + str(p))
        else:
            assert False
    else:
        s %= ''
    if file_dir != '':
        s = file_dir + '/' + s
    helper_functions.save_object(s, data)
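# Usage sketch (hedged; file_dir='data_sets' is an illustrative directory): note that with
# create_y_split = True the function prints the learned g weights and exits before saving,
# so that flag must be flipped to actually write boston_housing_raw_data_file.
#   create_boston_housing(file_dir='data_sets')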
def create_synthetic_curve_transfer():
    target_fun = lambda x: x ** 2
    source_fun = lambda x: x ** 2.5 + 1
    data = create_synthetic_regression_transfer(target_fun, source_fun)
    s = synthetic_curve_file
    helper_functions.save_object(s, data)
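# The slant, curve, and cross creators above all delegate to
# create_synthetic_regression_transfer with different target/source function pairs.
# A hedged sketch of adding another variant along the same lines (synthetic_shift_file is
# a hypothetical output name, not defined in this module):
#   def create_synthetic_shift_transfer():
#       target_fun = lambda x: x
#       source_fun = lambda x: x + 2
#       data = create_synthetic_regression_transfer(target_fun, source_fun)
#       helper_functions.save_object(synthetic_shift_file, data)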
Example #52
import numpy as np
import scipy
from data_sets import create_data_set
from data import data as data_lib
from utility import helper_functions


file = 'SAheart.data.txt'
all_field_names, data = create_data_set.load_csv(file, has_field_names=True, dtype='string', delim=',')
data[data == 'Present'] = '1'
data[data == 'Absent'] = '0'
data = data[:, 1:]
data = data.astype(np.float)
data = data_lib.Data(data[:, :-1], data[:, -1])
data.set_train()
data.set_target()
helper_functions.save_object('raw_data.pkl', data)
print ''
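# Reload sketch (hedged; assumes helper_functions.load_object mirrors save_object, as used
# elsewhere in this code base):
#   data = helper_functions.load_object('raw_data.pkl')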
            is_in_state = np.arange(i, i + 10)
            #y_val = times_series_vals[is_in_state, :800, 1].T
            y_val = times_series_vals[:, is_in_state, y_to_plot]
            x_val = range(y_val.shape[0])
            #print unique_series_ids[to_use]
            for j, s in enumerate(unique_series_ids[is_in_state]):
                print str(j) + ': ' + s
            array_functions.plot_2d_sub_multiple_y(np.asarray(x_val), y_val, title=None, sizes=10)
    else:
        for i in range(times_series_vals.shape[1]):
            y_val = times_series_vals[:, i, :]
            x_val = np.arange(y_val.shape[0])
            if not np.isfinite(y_val).sum(0).all():
                print 'skipping - missing labels'
                continue
            print unique_series_ids[i]
            array_functions.plot_2d_sub_multiple_y(np.asarray(x_val), y_val, title=unique_series_ids[i], sizes=10)


data = (unique_locs, times_series_vals[:,:,y_to_use],unique_series_ids)
suffix = y_names[y_to_use]
if use_monthly:
    suffix += '-month'
s = '../climate'
if use_monthly:
    s += '-month'
s += '/processed_data.pkl'
helper_functions.save_object(s, data)
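# Reload sketch (hedged; assumes load_object mirrors save_object): the saved object is the
# (unique_locs, series values for y_to_use, unique_series_ids) tuple built above.
#   unique_locs, series_vals, series_ids = helper_functions.load_object(s)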

pass