Exemple #1
0
def print_cluster_purity(k_means, X, Y, allow_recurse=True):
    """Print per-cluster statistics of Y for a clustering of X.

    ``k_means`` is fit on ``X`` with ``fit_predict``. For each of its
    ``n_clusters`` clusters the fraction of samples assigned to it and the
    mean and standard deviation of the matching ``Y`` values are printed.
    The sum of the per-cluster standard deviations, each weighted by the
    cluster's sample fraction, is printed last as 'Average STD'.

    When ``allow_recurse`` is true, a cluster that holds more than 20% of the
    samples and whose mean ``Y`` satisfies
    ``array_functions.in_range(mean, .2, .8)`` is plotted and clustered again
    with a fresh 4-cluster ``KMeans``. The nested call disables recursion, so
    the split is at most one level deep.

    Args:
        k_means: Object exposing ``n_clusters`` and ``fit_predict(X)``.
        X: 2-D array of samples (rows are indexed with a boolean mask).
        Y: Array of per-sample values, indexed with the same mask.
        allow_recurse: Whether mixed clusters may be split once more.

    Returns:
        None. All results are written to standard output.
    """
    cluster_inds = k_means.fit_predict(X)
    weighted_std = 0
    for i in range(k_means.n_clusters):
        in_cluster = cluster_inds == i
        yi = Y[in_cluster]
        # Compute each statistic once; they are printed and reused below.
        perc = in_cluster.mean()
        y_mean = yi.mean()
        y_std = yi.std()
        # Single-argument print(...) is valid on both Python 2 and Python 3.
        print('Cluster ' + str(i) + ': perc=' + str(perc))
        print('Mean: ' + str(y_mean))
        print('STD: ' + str(y_std))
        # Cheap flag first so the helper is only consulted when a split is
        # actually permitted.
        if allow_recurse and perc > .2 and array_functions.in_range(y_mean, .2, .8):
            print('Splitting cluster')
            Xi = X[in_cluster, :]
            array_functions.plot_heatmap(Xi, yi + .5, subtract_min=False)
            # No explicit fit here: the nested call runs fit_predict itself,
            # so fitting beforehand would only do the work twice.
            print_cluster_purity(KMeans(n_clusters=4), Xi, yi, allow_recurse=False)
        weighted_std += perc * y_std
    print('Average STD: ' + str(weighted_std))
Exemple #2
0
        sampled_boolean = array_functions.false(I.size)
        sampled_boolean[sampled] = True
        I &= sampled_boolean
        I1 &= I
        I2 &= I
        '''

    print 'n1: ' + str(I1.sum())
    print 'n2: ' + str(I2.sum())

    fig1 = pl.figure(3)

    dot_size = 30
    array_functions.plot_heatmap(x[I1, :],
                                 y[I1],
                                 sizes=dot_size,
                                 alpha=1,
                                 subtract_min=False,
                                 fig=fig1)
    pl.title('Values 1')
    fig2 = pl.figure(4)
    array_functions.plot_heatmap(x[I2, :],
                                 y[I2],
                                 sizes=dot_size,
                                 alpha=1,
                                 subtract_min=False,
                                 fig=fig2)
    pl.title('Values 2')
    array_functions.move_fig(fig1, 500, 500, 2000, 100)
    array_functions.move_fig(fig2, 500, 500, 2600, 100)
    pl.show(block=True)
def load_trip_data(file_names, y_names, time_name, loc_names, resolution=np.asarray([20, 20]), plot_data=True):
    """Load trip records from CSV files and bin them into per-day grid counts.

    Every file is read with ``load_csv`` and the rows are stacked; all files
    must share identical feature names. Trips whose two location coordinates
    both pass ``array_functions.is_in_percentile(..., 0.3, 0.7)`` are kept,
    assigned to a cell of a ``resolution[0]`` x ``resolution[1]`` grid with
    ``quantize_loc`` and counted per (day, cell).

    Args:
        file_names: Iterable of CSV files to load.
        y_names: Accepted for interface compatibility but unused. The columns
            it used to select were always overwritten before returning.
        time_name: Name of the column holding each trip's date string.
        loc_names: Names of the location columns; the first two selected
            columns are used as the two coordinates.
        resolution: Two integers, the number of grid cells along each axis.
        plot_data: When true, show the result with ``plot_heatmap``.

    Returns:
        Tuple ``(xy, y, names)``: ``xy`` is a float array of the
        ``(x_idx, y_idx)`` grid-cell indices, ``y`` is the natural log of two
        per-cell mean trip counts stacked as columns, and ``names`` holds the
        string form of every grid-cell index pair.
    """
    resolution = np.asarray(resolution)
    feat_names = None
    data = None
    for file_name in file_names:
        curr_feat_names, curr_data = load_csv(file_name, True, dtype="str", delim=",", num_rows=1000000000)
        if feat_names is None:
            feat_names = curr_feat_names
            data = curr_data
            continue
        assert (feat_names == curr_feat_names).all()
        data = np.vstack((data, curr_data))

    # Columns are loaded as strings, so numeric ones are cast explicitly.
    # The builtin float/int are used because the np.float/np.int aliases
    # were removed in NumPy 1.24.
    locs = data[:, array_functions.find_set(feat_names, loc_names)].astype(float)
    date_strs = data[:, find_first_element(feat_names, time_name)]
    date_ids = np.asarray([to_date(date_str).toordinal() for date_str in date_strs], dtype=int)

    # Day indices are relative to the earliest date across *all* loaded rows,
    # i.e. before the percentile filter below is applied.
    min_date_id = date_ids.min()
    num_days = date_ids.max() - min_date_id + 1
    dates_idx = date_ids - min_date_id
    num_locations = np.prod(resolution)
    trip_counts = np.zeros((num_days, num_locations))

    # Keep only trips whose two coordinates both pass the percentile test.
    p_min = 0.3
    p_max = 0.7
    is_in_range = array_functions.is_in_percentile(locs[:, 0], p_min, p_max) & array_functions.is_in_percentile(
        locs[:, 1], p_min, p_max
    )
    locs = locs[is_in_range, :]
    dates_idx = dates_idx[is_in_range]

    x_bins = quantize_loc(locs[:, 0], resolution[0])
    y_bins = quantize_loc(locs[:, 1], resolution[1])
    xy_bins = list(itertools.product(range(resolution[0]), range(resolution[1])))
    for x_idx, y_idx in xy_bins:
        is_in_cell = (x_bins == x_idx) & (y_bins == y_idx)
        # np.unique yields the distinct day indices seen in this cell and how
        # many trips fell on each of them.
        trip_dates, trips_per_date = np.unique(dates_idx[is_in_cell], return_counts=True)
        bin_idx = bin_to_idx([x_idx, y_idx], resolution)
        trip_counts[trip_dates, bin_idx] = trips_per_date

    # Two weekly samples from the first 30 days: rows 0, 7, 14, 21, 28 and
    # rows 4, 11, 18, 25, each averaged per cell.
    y1 = trip_counts[:30:7, :].mean(0)
    y2 = trip_counts[4:30:7, :].mean(0)
    y = np.stack((y1, y2), 1)

    # NOTE(review): cells with a zero mean become -inf here; confirm that
    # downstream consumers expect this.
    y = np.log(y)
    if plot_data:
        array_functions.plot_heatmap(np.asarray(xy_bins), y, sizes=50)
    return np.asarray(xy_bins, dtype=float), y, np.asarray([str(xy) for xy in xy_bins])
times_series_vals[times_series_vals < 0] = np.nan

plot_2d = True
plot_multiple_stations = True
y_to_plot = 3
if plot_data:
    if plot_2d:
        for i in range(num_days):
            if use_monthly:
                y_val = times_series_vals[[i, i+4], :, y_to_plot]
            else:
                y_val = times_series_vals[[i,60+i],:,y_to_plot]
                y_val1 = times_series_vals[range(i,i+30),:,y_to_plot].mean(0)
                y_val2 = times_series_vals[range(i+120, i + 150), :, y_to_plot].mean(0)
                y_val = np.stack((y_val1, y_val2), 1).T
            array_functions.plot_heatmap(unique_locs,y_val.T,alpha=1,title=None,sizes=None,share_axis=True)
    elif plot_multiple_stations:
        for i in range(0,400, 10):
            is_in_state = np.arange(i,i+10)
            #y_val = times_series_vals[is_in_state, :800, 1].T
            y_val = times_series_vals[:,is_in_state[:], y_to_plot]
            x_val = range(y_val.shape[0])
            #print unique_series_ids[to_use]
            for i, s in enumerate(unique_series_ids[is_in_state]):
                print str(i) + ': ' + s
            array_functions.plot_2d_sub_multiple_y(np.asarray(x_val), y_val, title=None, sizes=10)
    else:
        for i in range(times_series_vals.shape[1]):
            y_val = times_series_vals[:, i, :]
            x_val = np.arange(y_val.shape[0])
            if not np.isfinite(y_val).sum(0).all():
    '''
    for d in unique_dates:
        times_series_vals[d,i] = y[I[dates_idx == d]].mean()
    pass
    '''
    '''
    for j in I:
        print date_strs[j]
    '''
    '''
    print 'num_items: ' + str(I.size)
    print 'start: ' + date_strs[I[0]]
    print 'end: ' + date_strs[I[-1]]
    '''

# Keep only the series whose id is listed in station_names.
has_loc = array_functions.false(unique_series_ids.size)
# The loop variable is called series_id rather than id so the builtin id()
# is not shadowed for the rest of the module.
for i, series_id in enumerate(unique_series_ids):
    has_loc[i] = series_id in station_names
times_series_vals = times_series_vals[:, has_loc]
unique_series_ids = unique_series_ids[has_loc]
date_idx = 0  # NOTE(review): not read by any code visible here
# For every 5th start row i, plot rows i, i+28, i+56, ... (all below row 120)
# against the station locations. The slice is empty once i reaches 120.
for i in range(0, num_days, 5):
    x = station_locs
    y = times_series_vals[i:120:28, :]
    array_functions.plot_heatmap(x, y.T, title=None, sizes=30)

# Save the filtered values together with the ids they belong to.
data = (times_series_vals, unique_series_ids)
helper_functions.save_object('processed_data.pkl', data)
Exemple #6
0
def vis_data():
    """Plot the raw data set stored under ``data_sets/<data_file_dir>``.

    Behaviour is driven by module-level settings: ``data_file_dir`` selects
    the data set and the plot titles, while ``plot_climate``,
    ``plot_features``, ``plot_gradients`` and ``plot_values`` select what is
    drawn.

    * ``plot_features``: one ``plot_2d`` call per feature column against y.
    * ``plot_gradients`` or ``plot_values``: for each selected data-set id,
      the output of ``estimate_gradients`` is rescaled by subtracting its
      minimum and dividing by the resulting maximum, then shown with
      ``imshow``.
    * otherwise: a heatmap of the samples for each selected data-set id,
      optionally drawn over the climate terrain image.
    """
    raw = helper_functions.load_object('data_sets/' + data_file_dir + '/raw_data.pkl')
    x = raw.x
    y = raw.y

    label_idx = [0, 1]
    if plot_climate:
        image = imread('C:/PythonFramework/far_transfer/figures/climate-terrain.png')
        label_idx = [0, 4]

    # Titles per data set; unknown data sets fall back to empty titles.
    titles_by_dir = {
        'climate-month': [
            'Max Temperature Gradient: January',
            'Max Temperature Gradient: April',
        ],
        'irs-income': ['Income', 'Household Size'],
        'zillow-traffic': ['Morning Taxi Pickups', 'Housing Prices'],
        'kc-housing-spatial-floors': [
            'House Prices: 1 Floor',
            'House Prices: 2 or More Floors',
        ],
    }
    titles = titles_by_dir.get(data_file_dir, ['', ''])
    if data_file_dir == 'climate-month':
        label_idx = [0, 4]

    if plot_features:
        for col in range(raw.p):
            column = x[:, col]
            if raw.feature_names is not None:
                title = raw.feature_names[col]
            else:
                title = 'Feature Names Missing'
            array_functions.plot_2d(column, y, data_set_ids=raw.data_set_ids, title=title)
        return

    for label, title in zip(label_idx, titles):
        in_set = raw.data_set_ids == label
        if not (plot_gradients or plot_values):
            fig = plt.figure(4)
            array_functions.plot_heatmap(x[in_set, :], y[in_set], sizes=dot_sizes, fig=fig, title=title)
            if plot_climate:
                plt.imshow(image, zorder=0, extent=[-90, -78, 33.5, 38])
                array_functions.move_fig(fig, 1400, 600)
            continue

        g, v = estimate_gradients(x, y, in_set)
        if plot_values:
            g = v
        # Rescale in place: shift the minimum to zero, then divide by the max.
        g -= g.min()
        g /= g.max()
        # Only the first label of non-'zillow-traffic' data sets gets the
        # extra square root.
        if data_file_dir != 'zillow-traffic' and label == 0:
            g = np.sqrt(g)
        fig = plt.figure(label)
        plt.title(title)
        plt.axis('off')
        plt.imshow(g)
        array_functions.move_fig(fig, 750, 400)
    plt.show(block=True)
# Pack the per-zipcode lookups into parallel arrays, one entry per zipcode.
zipcode_array = np.zeros(len(zipcodes))
income_array = np.zeros(len(zipcodes))
locs = np.zeros((len(zipcodes), 2))
households = np.zeros(len(zipcodes))

for i, key in enumerate(zipcodes):
    zipcode_array[i] = key
    income_array[i] = zipcode_income[key]
    locs[i] = zipcode_locs[key]
    households[i] = zipcode_housing[key]

# Income is log-transformed before normalisation.
# NOTE(review): a zero income yields -inf here; confirm inputs are positive.
income_array = np.log(income_array)
income_array = array_functions.normalize(income_array)

households = array_functions.normalize(households)

# Each location coordinate is normalised independently.
locs[:, 0] = array_functions.normalize(locs[:, 0])
locs[:, 1] = array_functions.normalize(locs[:, 1])

# Two target columns: normalised log-income and normalised household value.
y = np.stack((income_array, households), 1)
# Single-argument print(...) is valid on both Python 2 and Python 3.
print('Num Used: ' + str(y.shape[0]))
array_functions.plot_heatmap(locs, y, sizes=100, share_axis=True)
# Sampling without replacement raises ValueError when more samples are
# requested than rows exist, so the sample size is capped at the row count.
I = np.random.choice(y.shape[0], min(400, y.shape[0]), replace=False)
data = (locs[I, :], y[I], zipcode_array[I])
helper_functions.save_object('processed_data.pkl', data)
    for d in unique_dates:
        times_series_vals[d,i] = y[I[dates_idx == d]].mean()
    pass
    '''
    '''
    for j in I:
        print date_strs[j]
    '''
    '''
    print 'num_items: ' + str(I.size)
    print 'start: ' + date_strs[I[0]]
    print 'end: ' + date_strs[I[-1]]
    '''

# Keep only the series whose id is listed in station_names.
has_loc = array_functions.false(unique_series_ids.size)
# The loop variable is called series_id rather than id so the builtin id()
# is not shadowed for the rest of the module.
for i, series_id in enumerate(unique_series_ids):
    has_loc[i] = series_id in station_names
times_series_vals = times_series_vals[:, has_loc]
unique_series_ids = unique_series_ids[has_loc]
date_idx = 0  # NOTE(review): not read by any code visible here
# For every 5th start row i, plot rows i, i+28, i+56, ... (all below row 120)
# against the station locations. The slice is empty once i reaches 120.
for i in range(0, num_days, 5):
    x = station_locs
    y = times_series_vals[i:120:28, :]
    array_functions.plot_heatmap(x, y.T, title=None, sizes=30)


# Save the filtered values together with the ids they belong to.
data = (times_series_vals, unique_series_ids)
helper_functions.save_object('processed_data.pkl', data)
def load_trip_data(file_names,
                   y_names,
                   time_name,
                   loc_names,
                   resolution=np.asarray([20, 20]),
                   plot_data=True):
    """Load trip records from CSV files and bin them into per-day grid counts.

    Every file is read with ``load_csv`` and the rows are stacked; all files
    must share identical feature names. Trips whose two location coordinates
    both pass ``array_functions.is_in_percentile(..., .3, .7)`` are kept,
    assigned to a cell of a ``resolution[0]`` x ``resolution[1]`` grid with
    ``quantize_loc`` and counted per (day, cell).

    Args:
        file_names: Iterable of CSV files to load.
        y_names: Accepted for interface compatibility but unused. The columns
            it used to select were always overwritten before returning.
        time_name: Name of the column holding each trip's date string.
        loc_names: Names of the location columns; the first two selected
            columns are used as the two coordinates.
        resolution: Two integers, the number of grid cells along each axis.
        plot_data: When true, show the result with ``plot_heatmap``.

    Returns:
        Tuple ``(xy, y, names)``: ``xy`` is a float array of the
        ``(x_idx, y_idx)`` grid-cell indices, ``y`` is the natural log of two
        per-cell mean trip counts stacked as columns, and ``names`` holds the
        string form of every grid-cell index pair.
    """
    resolution = np.asarray(resolution)
    feat_names = None
    data = None
    for file_name in file_names:
        curr_feat_names, curr_data = load_csv(file_name,
                                              True,
                                              dtype='str',
                                              delim=',',
                                              num_rows=1000000000)
        if feat_names is None:
            feat_names = curr_feat_names
            data = curr_data
            continue
        assert (feat_names == curr_feat_names).all()
        data = np.vstack((data, curr_data))

    # Columns are loaded as strings, so numeric ones are cast explicitly.
    # The builtin float/int are used because the np.float/np.int aliases
    # were removed in NumPy 1.24.
    locs = data[:, array_functions.find_set(feat_names, loc_names)]
    locs = locs.astype(float)
    date_strs = data[:, find_first_element(feat_names, time_name)]
    date_ids = np.asarray(
        [to_date(date_str).toordinal() for date_str in date_strs],
        dtype=int)

    # Day indices are relative to the earliest date across *all* loaded rows,
    # i.e. before the percentile filter below is applied.
    min_date_id = date_ids.min()
    num_days = date_ids.max() - min_date_id + 1
    dates_idx = date_ids - min_date_id
    num_locations = np.prod(resolution)
    trip_counts = np.zeros((num_days, num_locations))

    # Keep only trips whose two coordinates both pass the percentile test.
    p_min = .3
    p_max = .7
    is_in_range = array_functions.is_in_percentile(
        locs[:, 0], p_min, p_max) & array_functions.is_in_percentile(
            locs[:, 1], p_min, p_max)
    locs = locs[is_in_range, :]
    dates_idx = dates_idx[is_in_range]

    x_bins = quantize_loc(locs[:, 0], resolution[0])
    y_bins = quantize_loc(locs[:, 1], resolution[1])
    xy_bins = list(
        itertools.product(range(resolution[0]), range(resolution[1])))
    for x_idx, y_idx in xy_bins:
        is_in_cell = (x_bins == x_idx) & (y_bins == y_idx)
        # np.unique yields the distinct day indices seen in this cell and how
        # many trips fell on each of them.
        trip_dates, trips_per_date = np.unique(dates_idx[is_in_cell],
                                               return_counts=True)
        bin_idx = bin_to_idx([x_idx, y_idx], resolution)
        trip_counts[trip_dates, bin_idx] = trips_per_date

    # Two weekly samples from the first 30 days: rows 3, 10, 17, 24 and
    # rows 4, 11, 18, 25, each averaged per cell.
    y1 = trip_counts[3:30:7, :].mean(0)
    y2 = trip_counts[4:30:7, :].mean(0)
    y = np.stack((y1, y2), 1)

    # NOTE(review): cells with a zero mean become -inf here; confirm that
    # downstream consumers expect this.
    y = np.log(y)
    if plot_data:
        array_functions.plot_heatmap(np.asarray(xy_bins), y, sizes=50)
    return np.asarray(xy_bins, dtype=float), y, np.asarray(
        [str(xy) for xy in xy_bins])
Exemple #10
0
     vals_to_use = all_vals[:, is_all_finite]
     site_variance = vals_to_use.var(0)
     normalized_vals = vals_to_use.copy()
     for i in range(normalized_vals.shape[0]):
         v = normalized_vals[i, :]
         v -= v.mean()
         v /= v.var()
         normalized_vals[i, :] = v
     normalized_site_variance = normalized_vals.var(0)
     normalized_site_variance[normalized_site_variance > .35] = 1
     #array_functions.plot_heatmap(unique_locs[is_all_finite], site_variance / site_variance.max(), alpha=1, title=None, sizes=None, share_axis=True)
     #normalized_site_variance += .5
     array_functions.plot_heatmap(unique_locs[is_all_finite],
                                  normalized_site_variance /
                                  normalized_site_variance.max(),
                                  alpha=1,
                                  title=None,
                                  sizes=None,
                                  share_axis=True)
     print ''
 elif plot_2d:
     for i in range(num_time_intervals):
         if use_monthly:
             y_val = times_series_vals[[i, i + 1], :, y_to_plot]
         else:
             y_val = times_series_vals[[i, 60 + i], :, y_to_plot]
             y_val1 = times_series_vals[range(i, i + 30), :,
                                        y_to_plot].mean(0)
             y_val2 = times_series_vals[range(i + 120, i + 150), :,
                                        y_to_plot].mean(0)
             y_val = np.stack((y_val1, y_val2), 1).T
    if use_log:
        day_values = np.log(day_values)
        night_values = np.log(night_values)
    else:
        suffix += '-noLog'
    if viz:

        fig1 = pl.figure(3)
        I = np.isfinite(day_values)
        I &= array_functions.in_range(day_values, min_value, max_value)
        if just_center_data:
            I = in_range(day_locs[:, 0], .2, .8) & in_range(
                day_locs[:, 1], .2, .8)
        array_functions.plot_heatmap(day_locs[I, :],
                                     day_values[I],
                                     sizes=dot_size,
                                     alpha=1,
                                     subtract_min=False,
                                     fig=fig1)
        pl.title('day values')
        fig2 = pl.figure(4)
        I = np.isfinite(night_values)
        I &= array_functions.in_range(night_values, min_value, max_value)

        if just_center_data:
            I = in_range(night_locs[:, 0], .2, .8) & in_range(
                night_locs[:, 1], .2, .8)
        array_functions.plot_heatmap(night_locs[I, :],
                                     night_values[I],
                                     sizes=dot_size,
                                     alpha=1,
                                     subtract_min=False,
# Fill the parallel per-zipcode arrays, one entry per zipcode.
for i, key in enumerate(zipcodes):
    zipcode_array[i] = key
    income_array[i] = zipcode_income[key]
    locs[i] = zipcode_locs[key]
    households[i] = zipcode_housing[key]

# Income is log-transformed before normalisation.
# NOTE(review): a zero income yields -inf here; confirm inputs are positive.
income_array = np.log(income_array)
income_array = array_functions.normalize(income_array)

households = array_functions.normalize(households)

# Each location coordinate is normalised independently.
locs[:, 0] = array_functions.normalize(locs[:, 0])
locs[:, 1] = array_functions.normalize(locs[:, 1])

# Two target columns: normalised log-income and normalised household value.
y = np.stack((income_array, households), 1)
# Single-argument print(...) is valid on both Python 2 and Python 3.
print('Num Used: ' + str(y.shape[0]))
array_functions.plot_heatmap(locs, y, sizes=100, share_axis=True)
# Sampling without replacement raises ValueError when more samples are
# requested than rows exist, so the sample size is capped at the row count.
I = np.random.choice(y.shape[0], min(400, y.shape[0]), replace=False)
data = (locs[I, :], y[I], zipcode_array[I])
helper_functions.save_object('processed_data.pkl', data)
Exemple #13
0
        target_relative = (source_errors.mean() - target_errors.mean())/target_errors.mean()
        source_relative = (stacking_errors.mean() - target_errors.mean()) / stacking_errors.mean()
        print 'Target ' + s + ': '  + str(target_relative)
        print 'Stacking ' + s + ': ' + str(source_relative)
        if source_relative > target_relative:
            print '!!!'
    exit()
# Outside visualisation mode, normalise the first two columns of data.x
# in place.
if not viz:
    for i in range(2):
        data.x[:, i] = array_functions.normalize(data.x[:, i])
# Single-argument print(...) is valid on both Python 2 and Python 3.
print('n: ' + str(data.n))
if viz:
    # One heatmap per data set: id 0 on figure 1, id 1 on figure 2.
    I1 = data.data_set_ids == 0
    I2 = data.data_set_ids == 1
    fig1 = pl.figure(1)
    array_functions.plot_heatmap(data.x[I1, :], data.y[I1], sizes=30, alpha=1, subtract_min=True, fig=fig1)
    pl.xlabel('Longitude')
    pl.ylabel('Latitude')
    pl.title('Taxi Pickups')
    fig2 = pl.figure(2)
    array_functions.plot_heatmap(data.x[I2, :], data.y[I2], sizes=30, alpha=1, subtract_min=True, fig=fig2)
    pl.xlabel('Longitude')
    pl.ylabel('Latitude')
    pl.title('Housing Prices')
    # Switch back to figure 1 and apply figure 2's axis limits to it so both
    # plots cover the same extent.
    pl.figure(1)
    pl.xlim(fig2.axes[0].get_xlim())
    pl.ylim(fig2.axes[0].get_ylim())