Example 1
from sklearn import datasets

def create_diabetes():
    # Plot each diabetes feature against the regression target, both
    # min-max normalized (array_functions is a project-local helper module).
    diabetes_data = datasets.load_diabetes()
    x = diabetes_data.data
    y = diabetes_data.target
    yi = array_functions.normalize(y)
    for i in range(x.shape[1]):
        xi = array_functions.normalize(x[:, i])
        array_functions.plot_2d(xi, yi)
    # Intentional halt: this function only visualizes and never saves a data set.
    assert False
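These snippets rely on a project-local array_functions module whose source is not shown. For reference, here is a minimal sketch of what normalize presumably does (min-max scaling into [0, 1]); the project's actual implementation may differ:

import numpy as np

def normalize(v):
    # Assumed behavior: linearly rescale a 1-D array into [0, 1].
    v = np.asarray(v, dtype=float)
    vmin, vmax = v.min(), v.max()
    if vmax == vmin:
        return np.zeros_like(v)  # guard: constant input would divide by zero
    return (v - vmin) / (vmax - vmin)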
Example 3
from sklearn import datasets

def create_digits():
    # Plot each normalized digits feature against the integer class label;
    # the low alpha keeps heavily overlapping points readable.
    digits_data = datasets.load_digits()
    x = digits_data.data
    y = digits_data.target
    for i in range(x.shape[1]):
        xi = array_functions.normalize(x[:, i])
        array_functions.plot_2d(xi, y, alpha=0.01)
Example 5
import numpy as np
import pandas as pd

def create_bike_sharing():
    file = "bike_sharing/day.csv"
    columns = [0] + list(range(2, 16))
    # Read only the header row to recover the column names.
    all_field_names = np.asarray(pd.read_csv(file, nrows=1).columns)
    used_field_names = all_field_names[columns]
    bike_data = np.loadtxt(file, skiprows=1, delimiter=",", usecols=columns)
    # The binary 'yr' column separates the two years of data; use it as the domain id.
    domain_ind = used_field_names == "yr"
    domain_ids = np.squeeze(bike_data[:, domain_ind])
    # inds_to_keep = (used_field_names == 'temp') | (used_field_names == 'atemp')
    # bike_data = bike_data[:, inds_to_keep]
    # used_field_names = used_field_names[inds_to_keep]

    viz = True
    to_use = np.asarray([8, 9, 10, 11])  # weather and temperature columns
    x = bike_data[:, to_use]
    used_field_names = used_field_names[to_use]
    y = bike_data[:, -1]  # the rental count (last column) is the target
    if viz:
        # learner = make_learner()
        learner = None
        viz_features(x, y, domain_ids, used_field_names, learner=learner)
    field_to_use = 1
    x = x[:, field_to_use]

    data = data_class.Data()
    data.is_regression = True
    data.x = array_functions.vec_to_2d(x)
    data.x = array_functions.standardize(data.x)
    data.y = array_functions.normalize(y)
    data.set_defaults()
    data.data_set_ids = domain_ids

    # bike_file is a module-level format string (defined elsewhere) that
    # yields the output path for the pickled data set.
    s = bike_file % ("-feat=" + str(field_to_use))
    helper_functions.save_object(s, data)
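The helpers vec_to_2d and standardize are also project-local. Plausible minimal versions, stated as assumptions rather than the project's actual code:

import numpy as np

def vec_to_2d(v):
    # Assumed behavior: reshape a length-n vector into an (n, 1) column matrix.
    return np.asarray(v).reshape(-1, 1)

def standardize(x):
    # Assumed behavior: z-score each column (zero mean, unit variance).
    x = np.asarray(x, dtype=float)
    return (x - x.mean(axis=0)) / x.std(axis=0)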
Example 7

# (The snippet begins mid-call: the preceding lines, which load `data` and
# `feat_names` from a CSV, are missing from the source.)
#                                             True,
#                                             dtype='str',
#                                             delim=',')
y_name = 'price'
y_ind = array_functions.find_first_element(feat_names, y_name)
y = data[:, y_ind].astype(float)
y /= 100000
suffix = ''
if create_geospatial_data:
    x_feats = ['long', 'lat']
    x_feat_inds = array_functions.find_set(feat_names, x_feats)
    x = data[:, x_feat_inds]
    x = array_functions.remove_quotes(x)
    x = x.astype(float)

    x[:, 0] = array_functions.normalize(x[:, 0])
    x[:, 1] = array_functions.normalize(x[:, 1])
    I = array_functions.is_in_percentile(x[:, 0], 0.01, 0.99)
    I &= array_functions.is_in_percentile(x[:, 1], 0.01, 0.99)
    x = x[I, :]
    y = y[I]
    data = data[I, :]

    if split_date:
        dates = array_functions.remove_quotes(data[:, feat_names == 'date'])
        date_objs = []
        for d in dates:
            date_obj = get_date(d)
            date_objs.append(date_obj)
        min_date = min(date_objs)
        day_deltas = np.zeros(len(date_objs))
Example 8

    def predict(self, data):
        # Smooth the training labels over a similarity graph built from the
        # source model's predictions (Nadaraya-Watson style transfer).
        # d = data_lib.Data(np.expand_dims(data.source_y_pred, 1), data.y)
        y_pred_source = data.source_y_pred
        I = np.arange(y_pred_source.size)
        if self.predict_sample is not None and self.predict_sample < y_pred_source.size:
            I = np.random.choice(y_pred_source.size,
                                 self.predict_sample,
                                 replace=False)
        if self.use_rbf:
            # L = array_functions.make_laplacian(y_pred_source[I], self.sigma_tr)
            # Dense RBF affinities between the source predictions.
            W_source_pred = array_functions.make_rbf(y_pred_source[I],
                                                     self.sigma_tr)
            if self.oracle_guidance is not None:
                y = data.true_y[I]

                n_y = y.size
                num_to_sample = math.ceil(self.oracle_guidance * n_y**2)
                rand_index1 = np.random.choice(n_y,
                                               int(num_to_sample),
                                               replace=True)
                rand_index2 = np.random.choice(n_y,
                                               int(num_to_sample),
                                               replace=True)
                if self.oracle_guidance_binary:
                    target_distances = array_functions.make_graph_distance(y)
                    distance_threshold = .2 * (y.max() - y.min())
                    W_source_pred[rand_index1, rand_index2] = target_distances[
                        rand_index1, rand_index2] <= distance_threshold
                    W_source_pred[rand_index2, rand_index1] = target_distances[
                        rand_index2, rand_index1] <= distance_threshold
                else:
                    y_scaled = array_functions.normalize(y) * (
                        y_pred_source.max() - y_pred_source.min())
                    W_oracle_pred = array_functions.make_rbf(
                        y_scaled, self.sigma_tr)
                    W_source_pred[rand_index1,
                                  rand_index2] = W_oracle_pred[rand_index1,
                                                               rand_index2]
                    W_source_pred[rand_index2,
                                  rand_index1] = W_oracle_pred[rand_index2,
                                                               rand_index1]
            W = array_functions.make_rbf(self.transform.transform(self.x),
                                         self.sigma_nw,
                                         x2=self.transform.transform(
                                             data.x[I, :])).T

        else:
            assert self.oracle_guidance is None
            k_L = int(self.sigma_tr * I.size)
            #L = array_functions.make_laplacian_kNN(y_pred_source[I], k_L)
            W_source_pred = array_functions.make_knn(y_pred_source[I], k_L)
            k_W = int(self.sigma_nw * self.x.shape[0])
            W = array_functions.make_knn(self.transform.transform(
                data.x[I, :]),
                                         k_W,
                                         x2=self.transform.transform(self.x))
        sparsify_prediction_graph = False
        if self.use_prediction_graph_radius:
            sparsify_prediction_graph = True
            W_sparse = array_functions.make_graph_radius(
                self.transform.transform(data.x[I, :]),
                radius=self.radius,
            )
        if self.use_prediction_graph_sparsification:
            sparsify_prediction_graph = True
            W_sparse = array_functions.make_knn(self.transform.transform(
                data.x[I, :]),
                                                self.k_sparsification,
                                                normalize_entries=False)
            #W_L = array_functions.make_knn(y_pred_source[I], k_L)
        if sparsify_prediction_graph:
            W_source_pred = W_source_pred * W_sparse
        S = array_functions.make_smoothing_matrix(W)
        timing_test = False
        C = self.C * self.x.shape[0] / W_source_pred.sum()
        if (self.nystrom_percentage is not None and self.nystrom_percentage > 0) or timing_test:
            if timing_test:
                tic()
            Sy = S.dot(self.y)
            if C != 0:
                lamb = 1 / float(C)
                f = None
                tic()
                inv_approx, _ = array_functions.nystrom_woodbury_laplacian(
                    W_source_pred, lamb, self.nystrom_percentage)
                self.predict_time = toc()
                #_, f2 = array_functions.nystrom_woodbury_laplacian(W_source_pred, lamb, self.nystrom_percentage, v=Sy)
                if f is not None:
                    f *= lamb
                else:
                    inv_approx *= lamb
                    f = inv_approx.dot(Sy)
            else:
                f = Sy
            if timing_test:
                toc()
        if self.nystrom_percentage == 0 or self.nystrom_percentage is None or timing_test:
            if timing_test:
                tic()
            L = array_functions.make_laplacian_with_W(W_source_pred,
                                                      normalized=False)
            A = np.eye(I.size) + C * L
            try:
                tic()
                f = np.linalg.lstsq(A, S.dot(self.y), rcond=None)[0]
                self.predict_time = toc()
            except Exception:
                print('GraphTransferNW:predict failed, returning mean')
                f = self.y.mean() * np.ones(data.true_y.shape)
            if timing_test:
                toc()
        if timing_test:
            A_inv = np.linalg.inv(A)
            print('approx error: ' + str(norm(inv_approx - A_inv) / norm(A_inv)))
        o = results.Output(data)
        if self.predict_sample is not None:
            nw_data = data_lib.Data(data.x[I, :], f)
            self.nw_learner.train_and_test(nw_data)
            nw_output = self.nw_learner.predict(data)
            o.y = nw_output.y
            o.fu = nw_output.y
        else:
            o.y = f
            o.fu = f

        return o
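predict builds on several graph utilities from array_functions. A rough sketch of the two central ones, assuming make_rbf computes Gaussian-kernel affinities and make_smoothing_matrix row-normalizes them into a Nadaraya-Watson smoother (the real implementations may differ):

import numpy as np
from scipy.spatial.distance import cdist

def make_rbf(x, sigma, x2=None):
    # Assumed behavior: Gaussian-kernel affinities between rows of x,
    # or between rows of x and x2 when x2 is given; 1-D input is treated
    # as a single feature column.
    x = np.asarray(x, dtype=float).reshape(len(x), -1)
    x2 = x if x2 is None else np.asarray(x2, dtype=float).reshape(len(x2), -1)
    d = cdist(x, x2, 'sqeuclidean')
    return np.exp(-d / (2 * sigma ** 2))

def make_smoothing_matrix(w):
    # Assumed behavior: row-normalize the affinities so each row sums to 1,
    # giving the smoother S used above in f = S.dot(self.y).
    row_sums = w.sum(axis=1, keepdims=True)
    row_sums[row_sums == 0] = 1  # guard against isolated nodes
    return w / row_sums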
Example 9

import datetime

import numpy as np

def to_date(date_str):
    # Parse a 'month/day/year hh:mm:ss' timestamp into a datetime.date.
    a = date_str.split(' ')[0]
    month, day, year = [int(s) for s in a.split('/')]
    return datetime.date(year, month, day)
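# Illustrative check of the assumed 'month/day/year time' format:
assert to_date('4/1/2014 0:11:00') == datetime.date(2014, 4, 1)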


# file_name_apr and file_name_sep (the April and September trip files) and
# create_data_set are defined elsewhere in the project.
locs, y, ids = create_data_set.load_trip_data([file_name_apr, file_name_sep],
                                              None,
                                              'Date/Time',
                                              np.asarray(['Lon', 'Lat']),
                                              [100, 100],
                                              plot_data=True)
y[:, 0] /= y[:, 0].max()
y[:, 1] /= y[:, 1].max()
locs[:, 0] = array_functions.normalize(locs[:, 0])
locs[:, 1] = array_functions.normalize(locs[:, 1])

I = (y.sum(1) > 0)  # keep only rows with at least one trip
locs = locs[I, :]
y = y[I, :]
ids = ids[I]

data = (locs, y, ids)
helper_functions.save_object('processed_data.pkl', data)

Example 10
# zipcodes, zipcode_locs, zipcode_income, and zipcode_housing are built
# earlier in the script; keep only the zipcodes present in every table.
zipcodes.intersection_update(zipcode_locs.keys())
zipcodes.intersection_update(zipcode_housing.keys())

zipcode_array = np.zeros(len(zipcodes))
income_array = np.zeros(len(zipcodes))
locs = np.zeros((len(zipcodes), 2))
households = np.zeros(len(zipcodes))

for i, key in enumerate(zipcodes):
    zipcode_array[i] = key
    income_array[i] = zipcode_income[key]
    locs[i] = zipcode_locs[key]
    households[i] = zipcode_housing[key]

income_array = np.log(income_array)
income_array = array_functions.normalize(income_array)

households = array_functions.normalize(households)

locs[:, 0] = array_functions.normalize(locs[:, 0])
locs[:, 1] = array_functions.normalize(locs[:, 1])

#array_functions.plot_heatmap(locs, 10*income_array, sizes=50)
#array_functions.plot_heatmap(locs, households, sizes=50)
y = np.stack((income_array, households), 1)
print('Num Used: ' + str(y.shape[0]))
array_functions.plot_heatmap(locs, y, sizes=100, share_axis=True)
I = np.random.choice(y.shape[0], 400, replace=False)
data = (locs[I, :], y[I], zipcode_array[I])
helper_functions.save_object('processed_data.pkl', data)
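helper_functions.save_object is another project-local helper. Assuming it is a thin pickle wrapper, the saved tuple can be read back like this (a sketch, not the project's loader):

import pickle

with open('processed_data.pkl', 'rb') as f:
    locs, y, zipcode_array = pickle.load(f)  # the (locs, y, ids) tuple saved above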
Example 12
# (The snippet begins mid-loop: the enclosing state/split loops and the
# per-split result objects are missing from the source.)
            target_errors[split_idx] = loss.compute_score(target_results)
            source_errors[split_idx] = loss.compute_score(source_results)
            stacking_errors[split_idx] = loss.compute_score(stacking_results)
        errors[state_idx, 0] = target_errors.mean()
        errors[state_idx, 1] = source_errors.mean()
        errors[state_idx, 2] = stacking_errors.mean()
        target_relative = (source_errors.mean() - target_errors.mean()) / target_errors.mean()
        source_relative = (stacking_errors.mean() - target_errors.mean()) / stacking_errors.mean()
        print('Target ' + s + ': ' + str(target_relative))
        print('Stacking ' + s + ': ' + str(source_relative))
        if source_relative > target_relative:
            print('!!!')
    exit()
if not viz:
    for i in range(2):
        data.x[:, i] = array_functions.normalize(data.x[:, i])
print('n: ' + str(data.n))
if viz:
    I1 = data.data_set_ids == 0
    I2 = data.data_set_ids == 1
    fig1 = pl.figure(1)
    array_functions.plot_heatmap(data.x[I1, :], data.y[I1], sizes=30, alpha=1, subtract_min=True, fig=fig1)
    pl.xlabel('Longitude')
    pl.ylabel('Latitude')
    pl.title('Taxi Pickups')
    #pl.xticks([], [])
    #pl.yticks([], [])
    fig2 = pl.figure(2)
    array_functions.plot_heatmap(data.x[I2, :], data.y[I2], sizes=30, alpha=1, subtract_min=True, fig=fig2)
    pl.xlabel('Longitude')
    pl.ylabel('Latitude')
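array_functions.plot_heatmap is again project-local. A minimal sketch of the assumed behavior for the 1-D case (a scatter of 2-D locations colored by the target values); the project version evidently also accepts multi-column y and a share_axis option:

import numpy as np
import matplotlib.pyplot as plt

def plot_heatmap(x, y, sizes=10, alpha=1.0, subtract_min=False, fig=None):
    # Assumed behavior: scatter the 2-D locations in x, colored by y;
    # subtract_min shifts values so the minimum maps to the colormap's bottom.
    y = np.asarray(y, dtype=float)
    if subtract_min:
        y = y - y.min()
    if fig is None:
        fig = plt.figure()
    sc = plt.scatter(x[:, 0], x[:, 1], c=y, s=sizes, alpha=alpha)
    plt.colorbar(sc)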