Exemple #1
0
def get_zipcode_locations():
    """Build a zipcode -> location lookup from the zipcodes text file.

    Reads ``../zipcodes/zipcodes.txt`` through ``create_data_set.load_csv``
    (``has_field_names=True``, values requested as floats).

    Returns:
        dict: column 0 of each row cast to int, mapped to a 2-element array
        holding columns 2 and 1 of that row, in that order. When a zipcode
        occurs more than once the last row wins.
    """
    zipcode_file = '../zipcodes/zipcodes.txt'
    # Builtin float/int replace np.float/np.int: those names were plain
    # aliases of the builtins and were removed in NumPy 1.24.
    _, zipcode_data = create_data_set.load_csv(
        zipcode_file, has_field_names=True, dtype=float)
    # NOTE(review): columns [2, 1] are presumably (long, lat) -- confirm
    # against the file header.
    locs = zipcode_data[:, [2, 1]]
    zip_codes = zipcode_data[:, 0].astype(int)
    return dict(zip(zip_codes, locs))
def get_zipcode_locs():
    """Map each zipcode to its ``[Long, Lat]`` pair.

    Loads ``file_name_zip_lat_long`` as a data frame through
    ``create_data_set.load_csv`` and reads its ``Zipcode``, ``Lat`` and
    ``Long`` columns.

    Returns:
        dict: int zipcode -> length-2 float array ``[Long, Lat]``. Rows with
        a non-finite coordinate are left out.
    """
    _, loc_data = create_data_set.load_csv(
        file_name_zip_lat_long, dtype='string', return_data_frame=True)

    # Builtin int/float stand in for np.int/np.float, which were aliases of
    # the builtins and were removed in NumPy 1.24.
    zipcode = loc_data.Zipcode.values.astype(int)
    zip_lat = loc_data.Lat.values.astype(float)
    zip_lon = loc_data.Long.values.astype(float)
    zip_loc = np.stack((zip_lon, zip_lat), 1)
    # A NaN/inf in either coordinate makes the row sum non-finite.
    has_loc = np.isfinite(zip_loc.sum(1))
    return dict(zip(zipcode[has_loc], zip_loc[has_loc, :]))
def create_stations(file):
    """Load station ids and coordinates from a comma-delimited file.

    Args:
        file: Path handed to ``create_data_set.load_csv``. The returned
            field names must contain ``station_id``, ``long`` and ``lat``.

    Returns:
        tuple: ``(names, locs)`` where ``names`` is the ``station_id`` column
        (still strings) and ``locs`` holds the columns selected by
        ``find_set`` for ``['long', 'lat']``, converted to floats.
    """
    feat_names, data = create_data_set.load_csv(
        file, True, dtype='str', delim=',', num_rows=1000000000)
    id_col = array_functions.find_first_element(feat_names, 'station_id')
    names = data[:, id_col]
    locs = data[:, array_functions.find_set(feat_names, ['long', 'lat'])]
    # Builtin float replaces np.float (an alias removed in NumPy 1.24).
    return names, locs.astype(float)
def get_zipcode_housing():
    """Compute the mean household size per zipcode.

    Loads ``file_name_house_size`` as a data frame. Columns from index 4
    onward are weighted by 1..7 (so seven such columns are required),
    summed per row and divided by the ``Total`` column.

    Returns:
        dict: zipcode (as float, from the ``ZIP`` column) -> weighted mean.
        Only rows with a finite mean and ``Total`` above 100 are kept.
    """
    _, housing_data = create_data_set.load_csv(
        file_name_house_size, dtype='string', return_data_frame=True)
    # Builtin float replaces np.float (an alias removed in NumPy 1.24).
    zipcodes = housing_data.ZIP.values.astype(float)
    totals = housing_data.Total.values.astype(float)
    households = housing_data.values[:, 4:].astype(float)
    # NOTE(review): the weights presumably stand for household sizes 1-7 --
    # confirm against the data file's column layout.
    weight_vec = np.arange(1, 8)
    mean_households = households.dot(weight_vec) / totals
    keep = np.isfinite(mean_households) & (totals > 100)
    return dict(zip(zipcodes[keep], mean_households[keep]))
def create_stations(file):
    """Read station identifiers and their coordinates from a CSV file.

    Args:
        file: Path passed to ``create_data_set.load_csv`` (comma delimiter,
            all values read as strings).

    Returns:
        tuple: ``(names, locs)``. ``names`` is the ``station_id`` column;
        ``locs`` is the float conversion of the columns that ``find_set``
        selects for ``['long', 'lat']``.
    """
    feat_names, data = create_data_set.load_csv(
        file,
        True,
        dtype='str',
        delim=',',
        num_rows=1000000000,
    )
    station_col = array_functions.find_first_element(feat_names, 'station_id')
    coord_cols = array_functions.find_set(feat_names, ['long', 'lat'])
    # np.float was only an alias of the builtin and no longer exists in
    # NumPy >= 1.24, so cast with float directly.
    return data[:, station_col], data[:, coord_cols].astype(float)
def get_zipcode_housing():
    """Return the weighted mean household size for each zipcode.

    The data frame loaded from ``file_name_house_size`` supplies ``ZIP`` and
    ``Total``; every column from index 4 on is multiplied by the weights
    1..7 and summed, then divided by ``Total``.

    Returns:
        dict: float zipcode -> mean value, restricted to rows whose mean is
        finite and whose ``Total`` exceeds 100.
    """
    _, housing_data = create_data_set.load_csv(
        file_name_house_size, dtype='string', return_data_frame=True)
    # np.float was an alias of builtin float and was removed in NumPy 1.24.
    zipcodes = housing_data.ZIP.values.astype(float)
    totals = housing_data.Total.values.astype(float)
    households = housing_data.values[:, 4:].astype(float)
    weights = np.arange(1, 8)  # requires exactly seven household columns
    weighted_sums = households.dot(weights)
    mean_households = weighted_sums / totals
    # Drop rows with a zero/missing total (non-finite mean) or a small total.
    usable = np.isfinite(mean_households) & (totals > 100)
    return dict(zip(zipcodes[usable], mean_households[usable]))
def get_zipcode_locs():
    """Return a lookup from zipcode to a ``[Long, Lat]`` array.

    The ``Zipcode``, ``Lat`` and ``Long`` columns come from the data frame
    that ``create_data_set.load_csv`` builds for ``file_name_zip_lat_long``.

    Returns:
        dict: int zipcode -> float array ``[Long, Lat]``; entries whose
        coordinates are not both finite are omitted.
    """
    _, loc_data = create_data_set.load_csv(file_name_zip_lat_long,
                                           dtype='string',
                                           return_data_frame=True)

    # Cast with the builtins: np.int / np.float were aliases for them and
    # no longer exist in NumPy >= 1.24.
    zipcode = loc_data.Zipcode.values.astype(int)
    lat = loc_data.Lat.values.astype(float)
    lon = loc_data.Long.values.astype(float)
    lon_lat = np.stack((lon, lat), 1)
    # The row sum is finite only when both coordinates are.
    valid = np.isfinite(lon_lat.sum(1))
    return dict(zip(zipcode[valid], lon_lat[valid, :]))
def get_zipcode_wages():
    """Compute mean income per tax return for each zipcode.

    Loads ``file_name_income`` as a data frame and uses its ``ZipCode``,
    ``AdjustedGrossIncome`` and ``NumberOfReturns`` columns. Only every 8th
    row is used, starting at the index ``find_first_element`` returns for
    zipcode 90001.

    Returns:
        dict: int zipcode -> ``AdjustedGrossIncome / NumberOfReturns`` for
        the sampled rows with more than 50 returns.
    """
    _, income_data = create_data_set.load_csv(
        file_name_income, dtype='string', return_data_frame=True)

    # Builtin float/int replace np.float/np.int (aliases removed in
    # NumPy 1.24).
    zipcode = income_data.ZipCode.values.astype(float)
    agi = income_data.AdjustedGrossIncome.values.astype('string')
    num_returns = income_data.NumberOfReturns.values.astype('string')
    # NOTE(review): the stride of 8 presumably skips per-bracket rows so that
    # one row per zipcode remains -- confirm against the income file layout.
    first = find_first_element(zipcode, 90001)
    rows = np.arange(first, zipcode.shape[0], 8)

    zipcode = zipcode[rows].astype(int)
    agi = agi[rows].astype(float)
    num_returns = num_returns[rows].astype(float)

    mean_income = agi / num_returns
    enough_returns = num_returns > 50
    return dict(zip(zipcode[enough_returns], mean_income[enough_returns]))
def get_zipcode_wages():
    """Compute mean income per tax return for each zipcode, minus outliers.

    Loads ``file_name_income`` as a data frame and uses its ``ZipCode``,
    ``AdjustedGrossIncome`` and ``NumberOfReturns`` columns. Only every 8th
    row is used, starting at the index ``find_first_element`` returns for
    zipcode 90001.

    Returns:
        dict: int zipcode -> ``AdjustedGrossIncome / NumberOfReturns`` for
        sampled rows that have more than 50 returns and a mean income below
        the 99.6th percentile of all sampled means.
    """
    _, income_data = create_data_set.load_csv(
        file_name_income, dtype='string', return_data_frame=True)

    # Builtin float/int replace np.float/np.int (aliases removed in
    # NumPy 1.24).
    zipcode = income_data.ZipCode.values.astype(float)
    agi = income_data.AdjustedGrossIncome.values.astype('string')
    num_returns = income_data.NumberOfReturns.values.astype('string')
    # NOTE(review): the stride of 8 presumably skips per-bracket rows so that
    # one row per zipcode remains -- confirm against the income file layout.
    first = find_first_element(zipcode, 90001)
    rows = np.arange(first, zipcode.shape[0], 8)

    zipcode = zipcode[rows].astype(int)
    agi = agi[rows].astype(float)
    num_returns = num_returns[rows].astype(float)

    mean_income = agi / num_returns
    # np.percentile propagates NaN: a single 0/0 row would turn the cutoff
    # into NaN, make every comparison False and empty the result.
    # nanpercentile gives the same cutoff when no NaN is present.
    cutoff = np.nanpercentile(mean_income, 99.6)
    keep = (num_returns > 50) & (mean_income < cutoff)
    return dict(zip(zipcode[keep], mean_income[keep]))
import numpy as np
from data_sets import create_data_set
from utility import array_functions
from utility import helper_functions

# Join the SpecificStages feature file with its label file and pickle (x, y).
x_file_name = 'SpecificStages-truth-feats.csv'
y_file_name = 'SpecificStages-truth.csv'

# Labels: the first loaded row is dropped; column 0 is the id and column 3
# is parsed as the integer label.
_, y = create_data_set.load_csv(y_file_name, True, dtype='str', delim='\t')
y = y[1:,:]
id_to_y = dict((yi[0], int(yi[3])) for yi in y)

# Features: column 0 is the id; the whole table must parse as floats.
feature_names, feats = create_data_set.load_csv(x_file_name, True, dtype='str', delim=str('\t'))
feats = feats[1:,:]
ids = feats[:,0]
feats = np.asarray(feats, dtype='float')


x = feats[:,1:]
y = np.zeros((x.shape[0],1))
for idx, i in enumerate(ids):
    if i in id_to_y:
        y[idx] = id_to_y[i]
    else:
        # Parenthesised form prints the same text on Python 2 and 3.
        print('missing id')
        y[idx] = -1  # marker for rows without a label

data = (x,y)
helper_functions.save_object('processed_data.pkl', data)
Exemple #11
0
    s = s[0]
    year = s[:4]
    month = s[4:6]
    day = s[6:8]
    d = date(int(year), int(month), int(day))
    return d


# Script configuration.
create_geospatial_data = True
split_date = False
file_name = 'kc_house_data.csv'
save_data = True
sampled_size = 1000

feat_names, data = create_data_set.load_csv(file_name,
                                            True,
                                            dtype='str',
                                            delim=',')
y_name = 'price'
y_ind = array_functions.find_first_element(feat_names, y_name)
# Builtin float replaces np.float (an alias removed in NumPy 1.24).
y = data[:, y_ind].astype(float)
y /= 100000  # rescale the price column by 1e5
suffix = ''
if create_geospatial_data:
    # Use only the two coordinate columns as features.
    x_feats = ['long', 'lat']
    x_feat_inds = array_functions.find_set(feat_names, x_feats)
    x = data[:, x_feat_inds]
    x = array_functions.remove_quotes(x)
    x = x.astype(float)

    # Each coordinate column is normalized on its own.
    x[:, 0] = array_functions.normalize(x[:, 0])
    x[:, 1] = array_functions.normalize(x[:, 1])
    return d

# Pick the input files: daily by default, monthly when requested.
file_names = daily_file_names
if use_monthly:
    file_names = monthly_file_names

feats_to_keep = ['STATION', 'STATION_NAME', 'LATITUDE', 'LONGITUDE', 'DATE', 'TAVG', 'TMAX', 'TMIN', 'PRCP']

# The monthly files call the station-name column 'NAME'.
if use_monthly:
    feats_to_keep[1] = 'NAME'

for i, file in enumerate(file_names):
    feat_names_curr, data_curr = create_data_set.load_csv(
        file,
        True,
        dtype='str',
        delim=',',
        num_rows=1000000000
    )
    # Keep only the wanted columns; every one of them must be present.
    inds_to_use = np.asarray([j for j in range(feat_names_curr.size) if feat_names_curr[j] in feats_to_keep])
    assert inds_to_use.size == len(feats_to_keep)
    data_curr = data_curr[:, inds_to_use]
    feat_names_curr = feat_names_curr[inds_to_use]
    if i == 0:
        # The first file seeds the accumulated table.
        feat_names = feat_names_curr
        data = data_curr
        continue

    # Builtin str replaces np.str (an alias removed in NumPy 1.24).
    unique_stations = np.unique(data[:, find_first_element(feat_names, 'STATION')].astype(str))
    curr_stations = data_curr[:, find_first_element(feat_names, 'STATION')].astype(str)
    to_remove = array_functions.false(data_curr.shape[0])
def run_main():
    """Extract 'fc7' features for the Adience aligned faces and pickle them.

    Loads the Caffe age classifier, walks the rows of ``all_photos.csv``,
    and for every image that exists on disk stores the network's 'fc7' blob
    (512 values), the age returned by ``extract_age`` and the face id. The
    result is saved as a ``data_class.Data`` object under ``data_sets/``.

    Paths are hard-coded to one Windows machine. Prints progress and a
    'Skipping' line for each missing image. Returns nothing.
    """
    import caffe
    adience_caffe_model_dir = 'C:\\Users\\Aubrey\\Desktop\\cnn_age_gender_models_and_data.0.0.2\\'

    age_net_pretrained = '/age_net.caffemodel'
    age_net_model_file = '/deploy_age.prototxt'

    age_net = caffe.Classifier(adience_caffe_model_dir + age_net_model_file,
                               adience_caffe_model_dir + age_net_pretrained,
                               channel_swap=(2, 1, 0),
                               raw_scale=255,
                               image_dims=(256, 256))

    # Not referenced below; presumably the age labels used by the metadata.
    age_list = ['(0, 2)', '(4, 6)', '(8, 12)', '(15, 20)', '(25, 32)', '(38, 43)', '(48, 53)', '(60, 100)']

    adience_image_dir = 'C:\\Users\\Aubrey\\Desktop\\adience_aligned\\aligned\\'
    adience_metadata_file = 'C:\\Users\\Aubrey\\Desktop\\adience_aligned\\alined_metadata\\all_photos.csv'

    metadata = create_data_set.load_csv(adience_metadata_file,
                                        dtype='string',
                                        delim='\t',
                                        )

    column_names = metadata[0].tolist()
    photo_data = metadata[1]
    face_id_col = column_names.index('face_id')
    user_id_col = column_names.index('user_id')
    image_name_col = column_names.index('original_image')
    age_col = column_names.index('age')

    num_photos = photo_data.shape[0]
    x = np.zeros((num_photos, 512))
    y = np.zeros(num_photos)
    instance_ids = np.zeros(num_photos)
    blobs = ['fc7']
    # i counts stored images; it lags idx whenever an image is skipped.
    # NOTE(review): rows i..end stay all-zero and are saved as well --
    # confirm whether the arrays should be truncated to the first i rows.
    i = 0
    last_perc_done = 0
    for idx, row in enumerate(photo_data):
        perc_done = math.floor(100 * float(idx) / len(photo_data))
        if perc_done > last_perc_done:
            last_perc_done = perc_done
            # Parenthesised form prints the same text on Python 2 and 3.
            print(str(perc_done) + '% done')
        image_dir = adience_image_dir + row[user_id_col] + '/'
        face_id = row[face_id_col]
        image = image_dir + 'landmark_aligned_face.' + str(face_id) + '.' + row[image_name_col]
        if not os.path.isfile(image):
            print('Skipping: ' + image)
            continue
        input_image = caffe.io.load_image(image)
        age = row[age_col]
        features_age = predict_blobs(age_net, [input_image], blobs)
        x[i, :] = features_age
        y[i] = extract_age(age)
        instance_ids[i] = float(face_id)
        i += 1

    data = data_class.Data()
    data.x = x
    data.instance_ids = instance_ids
    data.y = y
    data.is_regression = True
    data.set_train()
    data.set_target()
    data.set_true_y()
    data_file = create_data_set.adience_aligned_cnn_file
    helper_functions.save_object('data_sets/' + data_file, data)
    print('TODO')
Exemple #14
0
    data.is_regression = True
    return data

def combine_data(x1, y1, x2, y2):
    """Stack two (x, y) data sets into one regression ``Data`` object.

    Args:
        x1, x2: Feature arrays, stacked row-wise with ``np.vstack``.
        y1, y2: Target arrays, concatenated in the same order.

    Returns:
        data_lib.Data: the combined data, with ``data_set_ids`` set to 0 for
        entries coming from the first set and 1 for the second, and
        ``is_regression`` set to True.
    """
    x = np.vstack((x1, x2))
    y = np.concatenate((y1, y2))
    data_set_ids = np.concatenate((np.zeros(y1.size), np.ones(y2.size)))
    data = data_lib.Data(x, y)
    data.data_set_ids = data_set_ids
    # The original line was the bare expression `data.is_regression`, which
    # has no effect; the flag is assigned here as was evidently intended.
    data.is_regression = True
    return data


if use_zipcode_data:
    file = 'Zip_Zhvi_AllHomes.csv'
    data_fields, string_data = create_data_set.load_csv(file, has_field_names=True, dtype='string')
    # Builtin int/float replace np.int/np.float (aliases removed in
    # NumPy 1.24).
    zip_code = vec_remove_quotations(string_data[:, 1]).astype(int)
    state = vec_remove_quotations(string_data[:, 3])
    # Compare the values in the '2001-01' and '2017-02' columns.
    year1_idx = array_functions.find_first_element(data_fields, '2001-01')
    year2_idx = array_functions.find_first_element(data_fields, '2017-02')
    pricing_data = string_data[:, [year1_idx, year2_idx]]
    pricing_data = vec_replace(pricing_data).astype(float)
    zipcode_location_map = get_zipcode_locations()
    locations = np.zeros((zip_code.size, 2))
    for i, z in enumerate(zip_code):
        if z not in zipcode_location_map:
            # Parenthesised form prints the same text on Python 2 and 3.
            print('missing zipcode: ' + str(z))
            locations[i, :] = np.nan  # unknown location
            continue
import numpy as np
from data_sets import create_data_set
from utility import array_functions
from utility import helper_functions
from datetime import date
from matplotlib import pyplot as pl
from data import data as data_lib

# Load the cached training data, rebuilding the cache from the CSV if that
# fails.
try:
    data = helper_functions.load_object('train.pkl')
except Exception:
    # `except Exception` rather than a bare `except:` so Ctrl-C and
    # SystemExit are not mistaken for a cache miss.
    file_name = 'train.csv'
    # Builtin float replaces np.float (an alias removed in NumPy 1.24).
    feat_names, data = create_data_set.load_csv(file_name,
                                                True,
                                                dtype=float,
                                                delim=',')
    data = data.astype(float)
    Y = data[:, 0]   # first column is the target
    X = data[:, 1:]  # remaining columns are the features
    data = {'X': X, 'Y': Y}
    helper_functions.save_object('train.pkl', data)
x = data['X']
x /= 256  # in-place rescale of the feature array
y = data['Y']
data = data_lib.Data(x, y)
helper_functions.save_object('raw_data.pkl', data)
Exemple #16
0
from utility.array_functions import find_first_element
import datetime
# Input CSV, passed to create_data_set.load_csv below.
file_name = 'pollution_us_2000_2016.csv'


def to_date(date_str):
    """Convert a ``'year-month-day'`` string into a ``datetime.date``.

    Raises ``ValueError`` when the string does not split into exactly three
    integer parts or the parts do not form a valid date.
    """
    year, month, day = map(int, date_str.split('-'))
    return datetime.date(year, month, day)


# Read the whole file as strings; num_rows is set far above any real size.
feat_names, data = create_data_set.load_csv(file_name,
                                            True,
                                            dtype='str',
                                            delim=',',
                                            num_rows=100000000000)
# Target columns: one '<name> Mean' column per name listed here.
y_names = [s + ' Mean' for s in ['NO2', 'O3', 'SO2', 'CO']]
y_inds = [
    array_functions.find_first_element(feat_names, name) for name in y_names
]
to_keep = array_functions.false(data.shape[0])
date_col = find_first_element(feat_names, 'Date Local')
date_strs = data[:, date_col]
prev = ''
def load_taxi_data(num_files_to_load=np.inf,
                   num_bins=50,
                   use_alternate=True,
                   return_coords=False):
    """Load taxi traces and count cars per spatial bin for two time windows.

    The per-file traces are read from ``data_dir`` (at most 535 files) and
    cached in a combined pickle; when that pickle already exists it is
    loaded instead and ``num_files_to_load`` has no effect.

    Args:
        num_files_to_load: Stop reading after this many files.
        num_bins: Bin count passed to ``quantize_loc`` and ``count_cars``.
        use_alternate: If True the windows are hours 5-12 and hours >= 17;
            otherwise exactly hour 9 and hour 18.
        return_coords: If True, bin locations are converted with
            ``bin_to_coordinates`` before being returned.

    Returns:
        tuple: ``(day_locs, day_values, night_locs, night_values, suffix)``
        where ``suffix`` is ``'2-<num_bins>'`` for the alternate windows and
        ``'3-<num_bins>'`` otherwise.
    """
    all_files = [
        f for f in os.listdir(data_dir) if path.isfile(path.join(data_dir, f))
    ]
    combined_data_file = 'C:/PythonFramework/data_sets/taxi/combined_data.pkl'
    if path.exists(combined_data_file):
        # Parenthesised prints give the same output on Python 2 and 3.
        print('loading combined data...')
        all_data = helper_functions.load_object(combined_data_file)
        print('done loading data')
    else:
        x = []
        y = []
        timestamps = []
        has_passenger = []
        for i, file_name in enumerate(all_files):
            if i == num_files_to_load:
                break
            if i >= 535:
                break
            file_data = load_csv(path.join(data_dir, file_name),
                                 has_field_names=False,
                                 delim=str(' '))[1]
            # Column 1 is stored as x and column 0 as y.
            y.append(file_data[:, 0])
            x.append(file_data[:, 1])
            has_passenger.append(file_data[:, 2])
            timestamps.append(file_data[:, 3])
            print(i)
        all_data = {
            'x': x,
            'y': y,
            'has_passenger': has_passenger,
            'time': timestamps
        }
        print('saving combined data...')
        helper_functions.save_object(combined_data_file, all_data)

    x_all = np.concatenate(all_data['x'])
    y_all = np.concatenate(all_data['y'])
    time_all = np.concatenate(all_data['time'])
    has_passenger_all = np.concatenate(all_data['has_passenger'])

    pickup_inds = get_pickup_inds(x_all, y_all, time_all, has_passenger_all)
    if just_pickup:
        x_all = x_all[pickup_inds]
        y_all = y_all[pickup_inds]
        time_all = time_all[pickup_inds]

    # Keep only points inside this bounding box, then bin the coordinates.
    x_bounds = [-122.48, -122.35]
    y_bounds = [37.7, 37.84]
    is_in_range = in_range(x_all, *x_bounds) & in_range(y_all, *y_bounds)
    x_all = quantize_loc(x_all[is_in_range], num_bins)
    y_all = quantize_loc(y_all[is_in_range], num_bins)
    time_all = time_all[is_in_range]

    hours = np.vectorize(get_hour)(time_all)

    if use_alternate:
        is_morning = (hours >= 5) & (hours <= 12)
        is_night = (hours >= 17)
        suffix = '2'
    else:
        is_morning = (hours == 9)
        is_night = (hours == 18)
        suffix = '3'
    suffix += '-' + str(num_bins)

    day_locs, day_values = count_cars(x_all[is_morning], y_all[is_morning],
                                      num_bins)
    night_locs, night_values = count_cars(x_all[is_night], y_all[is_night],
                                          num_bins)
    if return_coords:
        day_locs = bin_to_coordinates(day_locs, x_bounds, y_bounds, num_bins)
        night_locs = bin_to_coordinates(night_locs, x_bounds, y_bounds,
                                        num_bins)
    return day_locs, day_values, night_locs, night_values, suffix
Exemple #18
0
def run_main():
    """Run the Adience age network over the aligned faces and save features.

    For each metadata row whose image file exists, the 'fc7' blob of the
    Caffe age classifier (512 values), the age from ``extract_age`` and the
    numeric face id are recorded. Everything is wrapped in a
    ``data_class.Data`` object and pickled under ``data_sets/``.

    Model, image and metadata paths are hard-coded Windows paths. Progress
    and skipped images are printed. Returns nothing.
    """
    import caffe
    adience_caffe_model_dir = 'C:\\Users\\Aubrey\\Desktop\\cnn_age_gender_models_and_data.0.0.2\\'

    age_net_pretrained = '/age_net.caffemodel'
    age_net_model_file = '/deploy_age.prototxt'

    age_net = caffe.Classifier(adience_caffe_model_dir + age_net_model_file,
                               adience_caffe_model_dir + age_net_pretrained,
                               channel_swap=(2, 1, 0),
                               raw_scale=255,
                               image_dims=(256, 256))

    # Not referenced below; presumably the age labels used by the metadata.
    age_list = [
        '(0, 2)', '(4, 6)', '(8, 12)', '(15, 20)', '(25, 32)', '(38, 43)',
        '(48, 53)', '(60, 100)'
    ]

    adience_image_dir = 'C:\\Users\\Aubrey\\Desktop\\adience_aligned\\aligned\\'
    adience_metadata_file = 'C:\\Users\\Aubrey\\Desktop\\adience_aligned\\alined_metadata\\all_photos.csv'

    metadata = create_data_set.load_csv(
        adience_metadata_file,
        dtype='string',
        delim='\t',
    )

    column_names = metadata[0].tolist()
    photo_data = metadata[1]
    face_id_col = column_names.index('face_id')
    user_id_col = column_names.index('user_id')
    image_name_col = column_names.index('original_image')
    age_col = column_names.index('age')

    total = photo_data.shape[0]
    x = np.zeros((total, 512))
    y = np.zeros(total)
    face_ids = np.zeros(total)
    blobs = ['fc7']
    # `i` is the next free output row; it only advances for images found.
    # NOTE(review): output rows past the final `i` remain zero and are still
    # saved -- confirm whether they should be trimmed.
    i = 0
    last_perc_done = 0
    for idx, row in enumerate(photo_data):
        perc_done = math.floor(100 * float(idx) / len(photo_data))
        if perc_done > last_perc_done:
            last_perc_done = perc_done
            # print(...) with one argument behaves the same on Python 2/3.
            print(str(perc_done) + '% done')
        image_dir = adience_image_dir + row[user_id_col] + '/'
        face_id = row[face_id_col]
        image = image_dir + 'landmark_aligned_face.' + str(
            face_id) + '.' + row[image_name_col]
        if not os.path.isfile(image):
            print('Skipping: ' + image)
            continue
        input_image = caffe.io.load_image(image)
        age = row[age_col]
        features_age = predict_blobs(age_net, [input_image], blobs)
        x[i, :] = features_age
        y[i] = extract_age(age)
        face_ids[i] = float(face_id)
        i += 1

    data = data_class.Data()
    data.x = x
    data.instance_ids = face_ids
    data.y = y
    data.is_regression = True
    data.set_train()
    data.set_target()
    data.set_true_y()
    data_file = create_data_set.adience_aligned_cnn_file
    helper_functions.save_object('data_sets/' + data_file, data)
    print('TODO')
import datetime
# Input CSV, passed to create_data_set.load_csv below.
file_name = 'MER_T12_06.csv'

def to_date(date_str):
    """Turn a ``YYYYMM`` string into the first day of that month.

    The first four characters are the year and everything after them is the
    month; the day is always 1.
    """
    year_part, month_part = date_str[:4], date_str[4:]
    return datetime.date(int(year_part), int(month_part), 1)

# Read the whole file as strings; num_rows is set far above any real size.
feat_names, data = create_data_set.load_csv(file_name,
                                            True,
                                            dtype='str',
                                            delim=',',
                                            num_rows=100000000000)
y_names = ['Value']
y_inds = [
    array_functions.find_first_element(feat_names, name) for name in y_names
]
date_col = find_first_element(feat_names, 'YYYYMM')
date_strs = data[:, date_col]
prev = ''
date_str_to_idx = {}
date_ids = np.zeros(data.shape[0])
to_keep = array_functions.true(date_strs.shape[0])
# Discard rows whose month field is '13' or whose value is unavailable.
for i, date_str in enumerate(date_strs):
    if date_str[4:] == '13' or data[i, y_inds] == 'Not Available':
        to_keep[i] = False
Exemple #20
0
# Daily files by default; the monthly list replaces them when requested.
file_names = daily_file_names
if use_monthly:
    file_names = monthly_file_names

# Columns to retain. The second entry is 'NAME' for the monthly files and
# 'STATION_NAME' otherwise.
feats_to_keep = [
    'STATION',
    'NAME' if use_monthly else 'STATION_NAME',
    'LATITUDE',
    'LONGITUDE',
    'DATE',
    'TAVG',
    'TMAX',
    'TMIN',
    'PRCP',
]

for i, file in enumerate(file_names):
    feat_names_curr, data_curr = create_data_set.load_csv(file,
                                                          True,
                                                          dtype='str',
                                                          delim=',',
                                                          num_rows=1000000000)
    inds_to_use = np.asarray([
        j for j in range(feat_names_curr.size)
        if feat_names_curr[j] in feats_to_keep
    ])
    assert inds_to_use.size == len(feats_to_keep)
    data_curr = data_curr[:, inds_to_use]
    feat_names_curr = feat_names_curr[inds_to_use]
    if i == 0:
        feat_names = feat_names_curr
        data = data_curr
        continue

    unique_stations = np.unique(
Exemple #21
0
import numpy as np
import scipy
from data_sets import create_data_set
from data import data as data_lib
from utility import helper_functions


# Convert the SAheart table into a Data object and pickle it.
file = 'SAheart.data.txt'
all_field_names, data = create_data_set.load_csv(file, has_field_names=True,dtype='string',delim=str(','))
# Encode the categorical Present/Absent values as 1/0 so the table can be
# cast to float.
data[data == 'Present'] = '1'
data[data == 'Absent'] = '0'
data = data[:, 1:]  # drop the first column
# Builtin float replaces np.float (an alias removed in NumPy 1.24).
data = data.astype(float)
# The last column is the target; everything before it is a feature.
data = data_lib.Data(data[:, :-1], data[:, -1])
data.set_train()
data.set_target()
helper_functions.save_object('raw_data.pkl', data)
# Parenthesised form prints the same blank line on Python 2 and 3.
print('')
import numpy as np
from data_sets import create_data_set
from utility import array_functions
from utility import helper_functions

# Build (x, y) from the King County house data and pickle it.
file_name = 'kc_house_data.csv'

feat_names, data = create_data_set.load_csv(file_name, True, dtype='str', delim=',')
# Columns excluded from the feature matrix (the target is excluded as well).
feats_to_clear = ['id', 'date', 'yr_renovated', 'zipcode', 'lat', 'long']
y_name = 'price'
y_ind = array_functions.find_first_element(feat_names, y_name)
# Builtin float replaces np.float (an alias removed in NumPy 1.24).
y = data[:, y_ind].astype(float)
y /= 100000  # rescale the price column by 1e5
clear_idx = array_functions.find_set(feat_names, feats_to_clear + [y_name])
# NOTE(review): `~clear_idx` assumes find_set returns a boolean mask --
# confirm against array_functions.
x = data[:, ~clear_idx]
x = array_functions.remove_quotes(x)
x = x.astype(float)

data = (x,y)
helper_functions.save_object('processed_data.pkl', data)