def create_stations(file):
    """Load station metadata from a CSV file.

    Returns a tuple ``(names, locs)`` where ``names`` is the raw
    'station_id' column and ``locs`` is a float array of the
    'long'/'lat' columns.
    """
    # num_rows is effectively "read every row".
    feat_names, data = create_data_set.load_csv(file,
                                                True,
                                                dtype='str',
                                                delim=',',
                                                num_rows=1000000000)
    id_col = array_functions.find_first_element(feat_names, 'station_id')
    loc_cols = array_functions.find_set(feat_names, ['long', 'lat'])
    station_names = data[:, id_col]
    station_locs = data[:, loc_cols].astype(np.float)
    return station_names, station_locs
def create_stations(file):
    """Read the station CSV and return (station ids, float long/lat locations)."""
    # Huge num_rows means "no row limit".
    load_kwargs = dict(dtype='str', delim=',', num_rows=1000000000)
    feat_names, data = create_data_set.load_csv(file, True, **load_kwargs)
    names = data[:, array_functions.find_first_element(feat_names, 'station_id')]
    locs = data[:, array_functions.find_set(feat_names, ['long', 'lat'])]
    return names, locs.astype(np.float)
def get_zipcode_wages():
    """Build a dict mapping zipcode -> mean adjusted gross income per return.

    Only zipcodes with more than 50 returns are included.
    """
    income_fields, income_data = create_data_set.load_csv(
        file_name_income, dtype='string', return_data_frame=True)

    zipcode = income_data.ZipCode.values.astype(np.float)
    agi = income_data.AdjustedGrossIncome.values.astype('string')
    num_returns = income_data.NumberOfReturns.values.astype('string')
    # Rows repeat in groups of 8 per zipcode; sample one row per group,
    # starting from the first occurrence of zipcode 90001.
    start = find_first_element(zipcode, 90001)
    row_idx = np.arange(start, zipcode.shape[0], 8)

    zipcode = zipcode[row_idx].astype(np.int)
    agi = agi[row_idx].astype(np.float)
    num_returns = num_returns[row_idx].astype(np.float)

    mean_income = agi / num_returns
    enough_returns = num_returns > 50
    return dict(zip(zipcode[enough_returns], mean_income[enough_returns]))
def get_zipcode_wages():
    """Return {zipcode: mean AGI per return}, filtered for sample size and outliers."""
    income_fields, income_data = create_data_set.load_csv(
        file_name_income, dtype='string', return_data_frame=True)

    zipcode = income_data.ZipCode.values.astype(np.float)
    agi = income_data.AdjustedGrossIncome.values.astype('string')
    num_returns = income_data.NumberOfReturns.values.astype('string')
    # The file lists each zipcode in 8-row groups; take one row per group,
    # starting at the first occurrence of zipcode 90001.
    first = find_first_element(zipcode, 90001)
    sampled = np.arange(first, zipcode.shape[0], 8)

    zipcode = zipcode[sampled].astype(np.int)
    agi = agi[sampled].astype(np.float)
    num_returns = num_returns[sampled].astype(np.float)

    mean_income = agi / num_returns
    # Require more than 50 returns and drop the top 0.4% of mean incomes
    # as outliers.
    keep = (num_returns > 50) & (mean_income < np.percentile(mean_income, 99.6))
    return dict(zip(zipcode[keep], mean_income[keep]))
        file,
        True,
        dtype='str',
        delim=',',
        num_rows=1000000000
    )
    # Keep only the whitelisted feature columns, in their original order.
    inds_to_use = np.asarray([j for j in range(feat_names_curr.size) if feat_names_curr[j] in feats_to_keep])
    assert inds_to_use.size == len(feats_to_keep)
    data_curr = data_curr[:, inds_to_use]
    feat_names_curr = feat_names_curr[inds_to_use]
    # The first file initializes the accumulators; later files are appended.
    if i == 0:
        feat_names = feat_names_curr
        data = data_curr
        continue

    # Drop rows of the current file whose station already appears in the
    # accumulated data, so each station comes from exactly one file.
    unique_stations = np.unique(data[:, find_first_element(feat_names, 'STATION')].astype(np.str))
    # NOTE(review): this indexes data_curr with feat_names rather than
    # feat_names_curr; it assumes both files share the same column layout
    # after filtering -- confirm.
    curr_stations = data_curr[:, find_first_element(feat_names, 'STATION')].astype(np.str)
    to_remove = array_functions.false(data_curr.shape[0])
    for s in np.unique(curr_stations):
        if s not in unique_stations:
            continue
        print 'Found repeated station, removing: ' + s
        to_remove = to_remove | (curr_stations == s)
    data = np.vstack((data, data_curr[~to_remove,:]))
# Column indices of the target weather variables.
y_names = ['TAVG', 'TMIN', 'TMAX', 'PRCP']
y_inds = []
for name in y_names:
    y_inds.append(array_functions.find_first_element(feat_names, name))
# Raw date strings; mapped to ordinal ids further below.
date_strs = data[:, find_first_element(feat_names, 'DATE')]
prev = ''
date_str_to_idx = dict()
    names = data[:,
                 array_functions.find_first_element(feat_names, 'station_id')]
    locs = data[:, array_functions.find_set(feat_names, ['long', 'lat'])]
    return names, locs.astype(np.float)


station_names, station_locs = create_stations(station_file_name)

feat_names, data = create_data_set.load_csv(file_name,
                                            True,
                                            dtype='str',
                                            delim=',',
                                            num_rows=1000000000)
y_names = ['tripduration']
y_inds = array_functions.find_set(feat_names, y_names).nonzero()[0]
date_strs = data[:, find_first_element(feat_names, 'starttime')]
date_str_to_idx = dict()
date_ids = np.zeros(data.shape[0])
for i, date_str in enumerate(date_strs):
    date_obj = to_date(date_str)
    date_str_to_idx[date_str] = date_obj.toordinal()
    date_ids[i] = date_obj.toordinal()
date_ids = date_ids.astype(np.int)
y = data[:, y_inds].astype(np.float)

#y_sub = y[I, :]

#series_id = data[:, find_first_element(feat_names, 'Site Num')].astype(np.int)
a1 = data[:, find_first_element(feat_names, 'from_station_id')].astype(np.str)
a2 = data[:, find_first_element(feat_names, 'to_station_id')].astype(np.str)
#series_id = np.asarray([a + '-' + b for a,b in zip(a1,a2)])
Beispiel #7
0
feat_names, data = create_data_set.load_csv(
    file_name,
    True,
    dtype='str',
    delim=',',
    #num_rows=40000
    num_rows=100000000000)
# Pollutant daily-mean columns used as targets.
y_names = [s + ' Mean' for s in [
    'NO2',
    'O3',
    'SO2',
    'CO',
]]
y_inds = []
for name in y_names:
    y_inds.append(array_functions.find_first_element(feat_names, name))
to_keep = array_functions.false(data.shape[0])
date_strs = data[:, find_first_element(feat_names, 'Date Local')]
prev = ''
date_str_to_idx = dict()
date_ids = np.zeros(data.shape[0])
# Map each date string to its ordinal day and keep only the first row of
# each run of consecutive identical dates (deduplication).
for i, date_str in enumerate(date_strs):
    date_obj = to_date(date_str)
    date_str_to_idx[date_str] = date_obj.toordinal()
    date_ids[i] = date_obj.toordinal()
    if prev != date_str:
        to_keep[i] = True
        prev = date_str
data = data[to_keep, :]
date_strs = date_strs[to_keep]
# NOTE(review): date_ids is NOT filtered by to_keep here, unlike data and
# date_strs, so its length no longer matches -- confirm this is intended.
date_ids = date_ids.astype(np.int)
    year, month, day = [int(s) for s in a]
    d = datetime.date(year, month, day)
    return d

feat_names, data = create_data_set.load_csv(
    file_name,
    True,
    dtype='str',
    delim=',',
    #num_rows=40000
    num_rows=100000000000
)
# Pollutant daily-mean columns used as targets.
y_names = [s + ' Mean' for s in ['NO2', 'O3', 'SO2', 'CO', ]]
y_inds = []
for name in y_names:
    y_inds.append(array_functions.find_first_element(feat_names, name))
to_keep = array_functions.false(data.shape[0])
date_strs = data[:, find_first_element(feat_names, 'Date Local')]
prev = ''
date_str_to_idx = dict()
date_ids = np.zeros(data.shape[0])
# Map dates to ordinal day ids; keep only the first row of each run of
# consecutive identical date strings.
for i, date_str in enumerate(date_strs):
    date_obj = to_date(date_str)
    date_str_to_idx[date_str] = date_obj.toordinal()
    date_ids[i] = date_obj.toordinal()
    if prev != date_str:
        to_keep[i] = True
        prev = date_str
data = data[to_keep, :]
date_strs = date_strs[to_keep]
# NOTE(review): date_ids is not filtered by to_keep, unlike data/date_strs;
# lengths diverge after this point -- confirm intended.
date_ids = date_ids.astype(np.int)
Beispiel #9
0
    d = date(int(year), int(month), int(day))
    return d


# Configuration for processing the King County housing data set.
create_geospatial_data = True
split_date = False
file_name = 'kc_house_data.csv'
save_data = True
sampled_size = 1000

feat_names, data = create_data_set.load_csv(file_name,
                                            True,
                                            dtype='str',
                                            delim=',')
y_name = 'price'
y_ind = array_functions.find_first_element(feat_names, y_name)
y = data[:, y_ind].astype(np.float)
# Rescale price to units of $100k.
y /= 100000
suffix = ''
if create_geospatial_data:
    # Use longitude/latitude as the features.
    x_feats = ['long', 'lat']
    x_feat_inds = array_functions.find_set(feat_names, x_feats)
    x = data[:, x_feat_inds]
    x = array_functions.remove_quotes(x)
    x = x.astype(np.float)

    # Normalize each coordinate and drop the outer 1% tails on both axes.
    # NOTE(review): y is not filtered by I here -- confirm the labels are
    # subset elsewhere.
    x[:, 0] = array_functions.normalize(x[:, 0])
    x[:, 1] = array_functions.normalize(x[:, 1])
    I = array_functions.is_in_percentile(x[:, 0], .01, .99)
    I &= array_functions.is_in_percentile(x[:, 1], .01, .99)
    x = x[I, :]
def load_trip_data(file_names,
                   y_names,
                   time_name,
                   loc_names,
                   resolution=np.asarray([20, 20]),
                   plot_data=True):
    """Aggregate trip records from CSV files into per-grid-cell daily counts.

    Trips are binned onto a ``resolution[0] x resolution[1]`` spatial grid
    (restricted to the 30th-70th location percentiles) and counted per day;
    two weekly-averaged count series are returned per cell.

    Parameters
    ----------
    file_names : iterable of CSV paths, all with identical columns.
    y_names : column names for a target, or None (unused beyond loading).
    time_name : column holding a date string parseable by to_date().
    loc_names : two column names giving the trip coordinates.
    resolution : grid resolution per axis.
    plot_data : if True, draw a heatmap of the result.

    Returns
    -------
    (cell_coords, y, cell_labels) where cell_coords is (num_cells, 2),
    y is (num_cells, 2) of log-counts, and cell_labels are string ids.
    """
    resolution = np.asarray(resolution)
    feat_names = None
    data = None
    # Concatenate all files; every file must share the same header.
    for file_name in file_names:
        curr_feat_names, curr_data = load_csv(file_name,
                                              True,
                                              dtype='str',
                                              delim=',',
                                              num_rows=1000000000)
        if feat_names is None:
            feat_names = curr_feat_names
            data = curr_data
            continue
        assert (feat_names == curr_feat_names).all()
        data = np.vstack((data, curr_data))
    locs = data[:, array_functions.find_set(feat_names, loc_names)]
    y_inds = None
    if y_names is not None:
        y_inds = array_functions.find_set(feat_names, y_names).nonzero()[0]
        y = data[:, y_inds].astype(np.float)
    else:
        y = np.ones(data.shape[0])
    # Convert each record's timestamp to an ordinal day id.
    date_strs = data[:, find_first_element(feat_names, time_name)]
    date_str_to_idx = dict()
    date_ids = np.zeros(data.shape[0])
    for i, date_str in enumerate(date_strs):
        date_obj = to_date(date_str)
        date_str_to_idx[date_str] = date_obj.toordinal()
        date_ids[i] = date_obj.toordinal()
    date_ids = date_ids.astype(np.int)

    # Day index relative to the earliest day in the data.
    min_date_id = date_ids.min()
    max_date_id = date_ids.max()
    num_days = max_date_id - min_date_id + 1
    dates_idx = date_ids - min_date_id
    num_locations = np.prod(resolution)
    trip_counts = np.zeros((num_days, num_locations))
    locs = locs.astype(np.float)
    # Keep only trips within the central 30th-70th percentile on both axes.
    p_min = .3
    p_max = .7
    is_in_range = array_functions.is_in_percentile(
        locs[:, 0], p_min, p_max) & array_functions.is_in_percentile(
            locs[:, 1], p_min, p_max)
    locs = locs[is_in_range, :]
    dates_idx = dates_idx[is_in_range]
    # Quantize coordinates onto the grid and count trips per (day, cell).
    x_bins = quantize_loc(locs[:, 0], resolution[0])
    y_bins = quantize_loc(locs[:, 1], resolution[1])
    #array_functions.plot_2d(locs[I,0],locs[I,1])
    xy_bins = list(
        itertools.product(range(resolution[0]), range(resolution[1])))
    for x_idx, y_idx in xy_bins:
        is_in_cell = (x_bins == x_idx) & (y_bins == y_idx)
        trips_in_cell = dates_idx[is_in_cell]
        trip_dates, trips_per_date = np.unique(trips_in_cell,
                                               return_counts=True)
        bin_idx = bin_to_idx([x_idx, y_idx], resolution)
        trip_counts[trip_dates, bin_idx] = trips_per_date
    #y = trip_counts[[0, 3], :].T
    tuesday_saturday_idx = np.asarray([0, 4])
    first_tuesday_idx = np.asarray([0, 154])

    #y = trip_counts[first_tuesday_idx + 0, :].T
    '''
    y1 = trip_counts[:30,:].sum(0)
    y2 = trip_counts[154:, :].sum(0)
    '''
    # Average counts over two weekly-strided day series within the first
    # 30 days (offsets 3 and 4 -- presumably two weekdays; confirm).
    y1 = trip_counts[3:30:7, :].mean(0)
    y2 = trip_counts[4:30:7, :].mean(0)
    y = np.stack((y1, y2), 1)

    #y[y > 100] = 0
    #y[y > 5000] = 0
    #y[y == y.max()] == 0
    # NOTE(review): log of zero counts yields -inf; confirm downstream
    # code tolerates that.
    y = np.log(y)
    if plot_data:
        array_functions.plot_heatmap(np.asarray(xy_bins), y, sizes=50)
    return np.asarray(xy_bins, dtype=np.float), y, np.asarray(
        [str(xy) for xy in xy_bins])
Beispiel #11
0
                                                          delim=',',
                                                          num_rows=1000000000)
    # Keep only the whitelisted feature columns, in their original order.
    inds_to_use = np.asarray([
        j for j in range(feat_names_curr.size)
        if feat_names_curr[j] in feats_to_keep
    ])
    assert inds_to_use.size == len(feats_to_keep)
    data_curr = data_curr[:, inds_to_use]
    feat_names_curr = feat_names_curr[inds_to_use]
    # First file initializes the accumulators; later files are appended.
    if i == 0:
        feat_names = feat_names_curr
        data = data_curr
        continue

    # Drop rows whose station already appears in the accumulated data so
    # each station comes from exactly one file.
    unique_stations = np.unique(
        data[:, find_first_element(feat_names, 'STATION')].astype(np.str))
    # NOTE(review): indexes data_curr using feat_names rather than
    # feat_names_curr; assumes identical column layouts -- confirm.
    curr_stations = data_curr[:, find_first_element(feat_names, 'STATION'
                                                    )].astype(np.str)
    to_remove = array_functions.false(data_curr.shape[0])
    for s in np.unique(curr_stations):
        if s not in unique_stations:
            continue
        print 'Found repeated station, removing: ' + s
        to_remove = to_remove | (curr_stations == s)
    data = np.vstack((data, data_curr[~to_remove, :]))
# Column indices for each target name in y_names.
y_inds = []
for name in y_names:
    y_inds.append(array_functions.find_first_element(feat_names, name))
# Raw date strings; mapped to ordinal ids further below.
date_strs = data[:, find_first_element(feat_names, 'DATE')]
prev = ''
date_str_to_idx = dict()
Beispiel #12
0
    # Stack the two sub-data sets and tag each row with its source id (0/1).
    x = np.vstack((x1, x2))
    y = np.concatenate((y1, y2))
    data_set_ids = np.concatenate((np.zeros(y1.size), np.ones(y2.size)))
    data = data_lib.Data(x, y)
    data.data_set_ids = data_set_ids
    # NOTE(review): this attribute access is a no-op statement -- possibly
    # intended as an assignment (data.is_regression = True); confirm.
    data.is_regression
    return data


if use_zipcode_data:
    # Zillow home-value data: column 1 is the zipcode, column 3 the state.
    file = 'Zip_Zhvi_AllHomes.csv'
    data_fields, string_data = create_data_set.load_csv(file, has_field_names=True, dtype='string')
    zip_code = vec_remove_quotations(string_data[:, 1]).astype(np.int)
    state = vec_remove_quotations(string_data[:, 3])
    # Pick two price snapshots to compare (alternatives kept commented out).
    # year1_idx = array_functions.find_first_element(data_fields, '1996-04')
    year1_idx = array_functions.find_first_element(data_fields, '2001-01')
    # year1_idx = array_functions.find_first_element(data_fields, '2016-02')
    year2_idx = array_functions.find_first_element(data_fields, '2017-02')
    pricing_data = string_data[:, [year1_idx, year2_idx]]
    pricing_data = vec_replace(pricing_data).astype(np.float)
    # Look up coordinates for each zipcode; unknown zipcodes get NaN rows.
    zipcode_location_map = get_zipcode_locations()
    locations = np.zeros((zip_code.size, 2))
    for i, z in enumerate(zip_code):
        if z not in zipcode_location_map:
            print 'missing zipcode: ' + str(z)
            locations[i, :] = np.nan
            continue
        locations[i, :] = zipcode_location_map[z]

    all_states = np.unique(state)
else:
Beispiel #13
0
    day = 1
    d = datetime.date(year, month, day)
    return d


feat_names, data = create_data_set.load_csv(
    file_name,
    True,
    dtype='str',
    delim=',',
    #num_rows=40000
    num_rows=100000000000)
y_names = ['Value']
y_inds = []
for name in y_names:
    y_inds.append(array_functions.find_first_element(feat_names, name))
date_strs = data[:, find_first_element(feat_names, 'YYYYMM')]
prev = ''
date_str_to_idx = dict()
date_ids = np.zeros(data.shape[0])
to_keep = array_functions.true(date_strs.shape[0])
# Drop rows with month code '13' (presumably annual totals -- confirm)
# or a missing value, then map YYYYMM strings to ordinal day ids.
for i, date_str in enumerate(date_strs):
    if date_str[4:] == '13' or data[i, y_inds] == 'Not Available':
        to_keep[i] = False
        continue
    date_obj = to_date(date_str)
    date_str_to_idx[date_str] = date_obj.toordinal()
    date_ids[i] = date_obj.toordinal()
date_ids = date_ids[to_keep]
data = data[to_keep, :]
date_ids = date_ids.astype(np.int)
Beispiel #14
0
    year, month, day = [int(s) for s in a]
    d = datetime.date(year, month, day)
    return d


feat_names, data = create_data_set.load_csv(
    file_name,
    True,
    dtype='str',
    delim=',',
    #num_rows=40000
    num_rows=100000000000)
# Target columns: 'NONE' plus D0-D4 (presumably drought severity
# categories -- confirm against the data source).
y_names = ['NONE'] + ['D%d' % i for i in range(5)]
y_inds = []
for name in y_names:
    y_inds.append(array_functions.find_first_element(feat_names, name))
# Map each record's start timestamp to an ordinal day id.
date_strs = data[:, find_first_element(feat_names, 'validStart')]
prev = ''
date_str_to_idx = dict()
date_ids = np.zeros(data.shape[0])
for i, date_str in enumerate(date_strs):
    date_obj = to_date(date_str)
    date_str_to_idx[date_str] = date_obj.toordinal()
    date_ids[i] = date_obj.toordinal()
date_ids = date_ids.astype(np.int)
y = data[:, y_inds].astype(np.float)

#y_sub = y[I, :]

#series_id = data[:, find_first_element(feat_names, 'Site Num')].astype(np.int)
series_id = data[:, find_first_element(feat_names, 'state')] + '-' + \
def load_trip_data(file_names, y_names, time_name, loc_names, resolution=np.asarray([20, 20]), plot_data=True):
    """Aggregate trip records from CSV files into per-grid-cell daily counts.

    Trips are binned onto a ``resolution[0] x resolution[1]`` spatial grid
    (restricted to the 30th-70th location percentiles) and counted per day;
    two weekly-averaged count series are returned per cell.

    Parameters
    ----------
    file_names : iterable of CSV paths, all with identical columns.
    y_names : column names for a target, or None (unused beyond loading).
    time_name : column holding a date string parseable by to_date().
    loc_names : two column names giving the trip coordinates.
    resolution : grid resolution per axis.
    plot_data : if True, draw a heatmap of the result.

    Returns
    -------
    (cell_coords, y, cell_labels) where cell_coords is (num_cells, 2),
    y is (num_cells, 2) of log-counts, and cell_labels are string ids.
    """
    resolution = np.asarray(resolution)
    feat_names = None
    data = None
    # Concatenate all files; every file must share the same header.
    for file_name in file_names:
        curr_feat_names, curr_data = load_csv(file_name, True, dtype="str", delim=",", num_rows=1000000000)
        if feat_names is None:
            feat_names = curr_feat_names
            data = curr_data
            continue
        assert (feat_names == curr_feat_names).all()
        data = np.vstack((data, curr_data))
    locs = data[:, array_functions.find_set(feat_names, loc_names)]
    y_inds = None
    if y_names is not None:
        y_inds = array_functions.find_set(feat_names, y_names).nonzero()[0]
        y = data[:, y_inds].astype(np.float)
    else:
        y = np.ones(data.shape[0])
    # Convert each record's timestamp to an ordinal day id.
    date_strs = data[:, find_first_element(feat_names, time_name)]
    date_str_to_idx = dict()
    date_ids = np.zeros(data.shape[0])
    for i, date_str in enumerate(date_strs):
        date_obj = to_date(date_str)
        date_str_to_idx[date_str] = date_obj.toordinal()
        date_ids[i] = date_obj.toordinal()
    date_ids = date_ids.astype(np.int)

    # Day index relative to the earliest day in the data.
    min_date_id = date_ids.min()
    max_date_id = date_ids.max()
    num_days = max_date_id - min_date_id + 1
    dates_idx = date_ids - min_date_id
    num_locations = np.prod(resolution)
    trip_counts = 0 * np.ones((num_days, num_locations))
    locs = locs.astype(np.float)
    # Keep only trips within the central 30th-70th percentile on both axes.
    p_min = 0.3
    p_max = 0.7
    is_in_range = array_functions.is_in_percentile(locs[:, 0], p_min, p_max) & array_functions.is_in_percentile(
        locs[:, 1], p_min, p_max
    )
    locs = locs[is_in_range, :]
    dates_idx = dates_idx[is_in_range]
    # Quantize coordinates onto the grid and count trips per (day, cell).
    x_bins = quantize_loc(locs[:, 0], resolution[0])
    y_bins = quantize_loc(locs[:, 1], resolution[1])
    # array_functions.plot_2d(locs[I,0],locs[I,1])
    xy_bins = list(itertools.product(range(resolution[0]), range(resolution[1])))
    for x_idx, y_idx in xy_bins:
        is_in_cell = (x_bins == x_idx) & (y_bins == y_idx)
        trips_in_cell = dates_idx[is_in_cell]
        trip_dates, trips_per_date = np.unique(trips_in_cell, return_counts=True)
        bin_idx = bin_to_idx([x_idx, y_idx], resolution)
        trip_counts[trip_dates, bin_idx] = trips_per_date
    # y = trip_counts[[0, 3], :].T
    tuesday_saturday_idx = np.asarray([0, 4])
    first_tuesday_idx = np.asarray([0, 154])

    # y = trip_counts[first_tuesday_idx + 0, :].T
    """
    y1 = trip_counts[:30,:].sum(0)
    y2 = trip_counts[154:, :].sum(0)
    """
    # Average counts over two weekly-strided day series in the first 30
    # days. NOTE(review): y1 starts at offset 0 here, while the other copy
    # of this function starts at offset 3 -- confirm which is intended.
    y1 = trip_counts[:30:7, :].mean(0)
    y2 = trip_counts[4:30:7, :].mean(0)
    y = np.stack((y1, y2), 1)

    # y[y > 100] = 0
    # y[y > 5000] = 0
    # y[y == y.max()] == 0
    # NOTE(review): log of zero counts yields -inf; confirm downstream
    # code tolerates that.
    y = np.log(y)
    if plot_data:
        array_functions.plot_heatmap(np.asarray(xy_bins), y, sizes=50)
    return np.asarray(xy_bins, dtype=np.float), y, np.asarray([str(xy) for xy in xy_bins])
Beispiel #16
0
    year, month, day = [int(s) for s in a]
    d = datetime.date(year, month, day)
    return d

feat_names, data = create_data_set.load_csv(
    file_name,
    True,
    dtype='str',
    delim=',',
    #num_rows=40000
    num_rows=100000000000
)
# Target columns: 'NONE' plus D0-D4 (presumably drought severity
# categories -- confirm against the data source).
y_names = ['NONE'] + ['D%d' % i for i in range(5)]
y_inds = []
for name in y_names:
    y_inds.append(array_functions.find_first_element(feat_names, name))
# Map each record's start timestamp to an ordinal day id.
date_strs = data[:, find_first_element(feat_names, 'validStart')]
prev = ''
date_str_to_idx = dict()
date_ids = np.zeros(data.shape[0])
for i, date_str in enumerate(date_strs):
    date_obj = to_date(date_str)
    date_str_to_idx[date_str] = date_obj.toordinal()
    date_ids[i] = date_obj.toordinal()
date_ids = date_ids.astype(np.int)
y = data[:, y_inds].astype(np.float)

#y_sub = y[I, :]

#series_id = data[:, find_first_element(feat_names, 'Site Num')].astype(np.int)
series_id = data[:, find_first_element(feat_names, 'state')] + '-' + \
Beispiel #17
0
    names = data[:, array_functions.find_first_element(feat_names, 'station_id')]
    locs = data[:, array_functions.find_set(feat_names, ['long', 'lat'])]
    return names, locs.astype(np.float)

station_names, station_locs = create_stations(station_file_name)

# Load the trip records; huge num_rows means "read everything".
feat_names, data = create_data_set.load_csv(
    file_name,
    True,
    dtype='str',
    delim=',',
    num_rows=1000000000
)
y_names = ['tripduration']
y_inds = array_functions.find_set(feat_names, y_names).nonzero()[0]
# Convert each trip start time to its ordinal day number.
date_strs = data[:, find_first_element(feat_names, 'starttime')]
date_str_to_idx = dict()
date_ids = np.zeros(data.shape[0])
for i, date_str in enumerate(date_strs):
    date_obj = to_date(date_str)
    date_str_to_idx[date_str] = date_obj.toordinal()
    date_ids[i] = date_obj.toordinal()
date_ids = date_ids.astype(np.int)
y = data[:, y_inds].astype(np.float)

#y_sub = y[I, :]

#series_id = data[:, find_first_element(feat_names, 'Site Num')].astype(np.int)
# Origin and destination station ids for each trip.
a1 = data[:, find_first_element(feat_names, 'from_station_id')].astype(np.str)
a2 = data[:, find_first_element(feat_names, 'to_station_id')].astype(np.str)
#series_id = np.asarray([a + '-' + b for a,b in zip(a1,a2)])
Beispiel #18
0
    day = 1
    d = datetime.date(year, month, day)
    return d

feat_names, data = create_data_set.load_csv(
    file_name,
    True,
    dtype='str',
    delim=',',
    #num_rows=40000
    num_rows=100000000000
)
y_names = ['Value']
y_inds = []
for name in y_names:
    y_inds.append(array_functions.find_first_element(feat_names, name))
date_strs = data[:, find_first_element(feat_names, 'YYYYMM')]
prev = ''
date_str_to_idx = dict()
date_ids = np.zeros(data.shape[0])
to_keep = array_functions.true(date_strs.shape[0])
# Drop rows with month code '13' (presumably annual totals -- confirm)
# or a missing value, then map YYYYMM strings to ordinal day ids.
for i, date_str in enumerate(date_strs):
    if date_str[4:] == '13' or data[i, y_inds] == 'Not Available':
        to_keep[i] = False
        continue
    date_obj = to_date(date_str)
    date_str_to_idx[date_str] = date_obj.toordinal()
    date_ids[i] = date_obj.toordinal()
date_ids = date_ids[to_keep]
data = data[to_keep, :]
date_ids = date_ids.astype(np.int)
Beispiel #19
0
import numpy as np
from data_sets import create_data_set
from utility import array_functions
from utility import helper_functions

file_name = 'kc_house_data.csv'

feat_names, data = create_data_set.load_csv(file_name, True, dtype='str', delim=',')
# Columns excluded from the feature matrix (ids, dates, location fields).
feats_to_clear = ['id', 'date', 'yr_renovated', 'zipcode', 'lat', 'long']
y_name = 'price'
y_ind = array_functions.find_first_element(feat_names, y_name)
y = data[:, y_ind].astype(np.float)
# Rescale price to units of $100k.
y /= 100000
# Feature matrix: everything except the cleared columns and the target.
clear_idx = array_functions.find_set(feat_names, feats_to_clear + [y_name])
x = data[:, ~clear_idx]
x = array_functions.remove_quotes(x)
x = x.astype(np.float)

# Persist the processed (x, y) pair for downstream experiments.
data = (x,y)
helper_functions.save_object('processed_data.pkl', data)

pass