def create_wine(data_to_create=WINE_RED):
    """Build a wine-quality data set and save it via ``helper_functions.save_object``.

    Both ``wine/winequality-red.csv`` and ``wine/winequality-white.csv`` are
    always loaded (``;``-delimited); field names come from the red file. The
    last CSV column becomes the target ``y``; the remaining columns are passed
    through ``array_functions.standardize`` and become the features ``x``.

    Args:
        data_to_create: ``WINE_RED``, ``WINE_WHITE`` or ``WINE_TRANSFER``.
            For ``WINE_TRANSFER`` the red and white rows are stacked and
            tagged with data-set id 0 (red) / 1 (white), and the features are
            plotted with ``viz_features`` before saving. For the other two
            options ``data_set_ids`` is ``None``.

    Raises:
        ValueError: If ``data_to_create`` is not one of the three options.
            (Previously an ``assert False``, which is stripped under ``-O``.)
    """
    red_file = "wine/winequality-red.csv"
    white_file = "wine/winequality-white.csv"
    field_names, red_data = load_csv(red_file, delim=";")
    white_data = load_csv(white_file, delim=";")[1]

    if data_to_create == WINE_TRANSFER:
        # Append a data-set id column: 0 for red rows, 1 for white rows.
        red_ids = np.zeros((red_data.shape[0], 1))
        white_ids = np.ones((white_data.shape[0], 1))
        red_data = np.hstack((red_data, red_ids))
        white_data = np.hstack((white_data, white_ids))
        wine_data = np.vstack((red_data, white_data))

        # After the append, the last column is the id and the one before it
        # is the target.
        ids = wine_data[:, -1]
        x = wine_data[:, :-2]
        y = wine_data[:, -2]
        used_field_names = field_names[:-1]
        viz = True
        if viz:
            # Use ``learner = None`` to plot without a learner.
            learner = make_learner()
            viz_features(x, y, ids, used_field_names, alpha=0.01, learner=learner)
        suffix = "transfer"
    else:
        if data_to_create == WINE_RED:
            wine_data = red_data
            suffix = "red"
        elif data_to_create == WINE_WHITE:
            wine_data = white_data
            suffix = "white"
        else:
            raise ValueError("unknown data_to_create: %r" % (data_to_create,))

        ids = None
        x = wine_data[:, :-1]
        y = wine_data[:, -1]

    data = data_class.Data()
    data.x = array_functions.standardize(x)
    data.y = y
    data.set_train()
    data.set_target()
    data.set_true_y()
    data.data_set_ids = ids
    data.is_regression = True

    # Optional sub-sampling, kept disabled as in the original:
    # data = data.rand_sample(.25, data.data_set_ids == 0)
    # data = data.rand_sample(.1, data.data_set_ids == 1)
    # s = wine_file % ('-small-' + str(data.p))
    s = wine_file % ("-" + suffix)
    helper_functions.save_object(s, data)
def create_energy():
    """Load ``energy/ENB2012_data.csv`` and hand it to ``viz_features``.

    The whole table is passed as the features, its second-to-last column as
    the target and column 4 as the domain ids. ``learner=None`` is passed and
    the function returns nothing.
    """
    names, table = load_csv('energy/ENB2012_data.csv')
    domains = table[:, 4]
    target = table[:, -2]

    # Kept from the original block; it would only be needed to pass
    # method.NadarayaWatsonMethod() as the learner instead of None.
    from methods import method

    viz_features(table, target, domains, names, learner=None)
def create_energy():
    """Visualize the columns of ``energy/ENB2012_data.csv``.

    Passes to ``viz_features``: the full table as features, the
    second-to-last column as the target, column 4 as the domain ids, the
    loaded field names, and ``learner=None``. Returns nothing.
    """
    csv_path = "energy/ENB2012_data.csv"
    column_names, energy_table = load_csv(csv_path)
    domain_column = energy_table[:, 4]
    target_column = energy_table[:, -2]

    # Import retained from the original; used only if the learner below is
    # switched to method.NadarayaWatsonMethod().
    from methods import method

    viz_features(
        energy_table,
        target_column,
        domain_column,
        column_names,
        learner=None,
    )
def create_pair(i, j, y_col):
    """Stack two pair data sets into one two-domain data set, plot and save it.

    Args:
        i: Identifier passed to ``load_pair`` for the first data set (domain 0).
        j: Identifier passed to ``load_pair`` for the second data set (domain 1).
        y_col: Column index used as the target; column ``1 - y_col`` is the
            feature.
    """
    file = pair_file(i, j)
    first = load_pair(i)
    second = load_pair(j)
    stacked = np.vstack((first, second))

    x = stacked[:, 1 - y_col]
    y = stacked[:, y_col]

    # Rows from the first data set get id 0, rows from the second get id 1.
    domain_ids = np.concatenate(
        (np.zeros(first.shape[0]), np.ones(second.shape[0]))
    )

    # Pass learner=make_learner() instead to plot with a learner.
    viz_features(x, y, domain_ids, learner=None)
    create_and_save_data(x, y, domain_ids, file)
def create_pair(i, j, y_col):
    """Merge the pair data sets ``i`` and ``j`` into one labelled data set.

    The rows of both data sets are stacked vertically. Column ``y_col``
    becomes the target and column ``1 - y_col`` the feature. Rows coming from
    ``i`` get domain id 0 and rows coming from ``j`` get domain id 1. The
    result is plotted with ``viz_features`` (``learner=None``) and written
    with ``create_and_save_data`` to the path returned by ``pair_file(i, j)``.
    """
    out_path = pair_file(i, j)
    rows_i = load_pair(i)
    rows_j = load_pair(j)
    combined = np.vstack((rows_i, rows_j))

    feature = combined[:, 1 - y_col]
    target = combined[:, y_col]

    n_i = rows_i.shape[0]
    n_j = rows_j.shape[0]
    domain_ids = np.hstack((np.zeros(n_i), np.ones(n_j)))

    # Alternative: viz_features(feature, target, domain_ids, learner=make_learner())
    viz_features(feature, target, domain_ids, learner=None)
    create_and_save_data(feature, target, domain_ids, out_path)
def create_forest_fires():
    """Load ``forest_fires/forestfires.csv``, filter it and plot the features.

    CSV columns 2 and 3 are converted from month / weekday abbreviations to
    integers (jan=1 .. dec=12, sun=1 .. sat=7). The last column is the target
    and the column named ``month`` supplies the domain ids. Only rows whose
    month is 6, 7 or 8 and whose target lies strictly between 0 and 700 are
    kept, and the first four columns are dropped from the features. The
    result is plotted with ``viz_features`` using a ``NadarayaWatsonMethod``
    learner; nothing is returned.
    """
    month_names = ('jan', 'feb', 'mar', 'apr', 'may', 'jun',
                   'jul', 'aug', 'sep', 'oct', 'nov', 'dec')
    day_names = ('sun', 'mon', 'tue', 'wed', 'thu', 'fri', 'sat')
    months = {name: number for number, name in enumerate(month_names, 1)}
    days = {name: number for number, name in enumerate(day_names, 1)}

    def month_to_season(name):
        # Currently the plain month number; a season bucket,
        # (months[x]-1)/3, was used at some point.
        return months[name]

    def day_to_int(name):
        return days[name]

    field_names, forest_data = load_csv(
        'forest_fires/forestfires.csv',
        dtype='float',
        converters={2: month_to_season, 3: day_to_int},
    )

    y = forest_data[:, -1]
    month_column = field_names == 'month'
    domain_ids = forest_data[:, month_column]

    # Keep only the selected months (use 1..12 here to keep every month).
    months_to_use = np.asarray([6, 7, 8])
    selected = array_functions.find_set(domain_ids, months_to_use)
    x = forest_data[selected, :]
    y = y[selected]
    domain_ids = domain_ids[selected]

    # Drop the first four columns from the features and their names.
    x = x[:, 4:]
    field_names = field_names[4:]

    # Keep rows whose target is strictly inside (0, 700).
    in_range = (y > 0) & (y < 700)
    x = x[in_range, :]
    y = y[in_range]
    domain_ids = domain_ids[in_range]

    from methods import method
    viz_features(x, y, domain_ids, field_names,
                 learner=method.NadarayaWatsonMethod())
def create_mpg():
    """Load ``mpg/auto-mpg.data.txt`` and plot it with ``viz_features``.

    The file is read as whitespace-separated strings, the last column is
    dropped, rows containing ``"?"`` in any remaining column are removed, and
    the rest is converted to float. Column 0 is the target, column 1 supplies
    the domain ids, and the full float table (including both) is passed as
    the features. Returns nothing.
    """
    file = "mpg/auto-mpg.data.txt"
    # sep=r"\s+" is the documented equivalent of delim_whitespace=True, which
    # pandas has deprecated.
    table = pd.read_csv(file, skiprows=0, sep=r"\s+", dtype="string")
    data = np.asarray(table)[:, 0:-1]

    # "?" marks a missing value; drop every row that has one.
    has_missing_values = (data == "?").any(axis=1)
    data = data[~has_missing_values, :]
    data = data.astype("float")

    domain_ids = data[:, 1]
    x = data
    y = data[:, 0]
    viz_features(x, y, domain_ids)
def create_mpg():
    """Load ``mpg/auto-mpg.data.txt`` and plot it with ``viz_features``.

    The file is read as whitespace-separated strings, the last column is
    dropped, rows containing ``'?'`` in any remaining column are removed, and
    the rest is converted to float. Column 0 is the target, column 1 supplies
    the domain ids, and the full float table (including both) is passed as
    the features. Returns nothing.
    """
    file = 'mpg/auto-mpg.data.txt'
    # sep=r'\s+' is the documented equivalent of delim_whitespace=True, which
    # pandas has deprecated.
    table = pd.read_csv(file, skiprows=0, sep=r'\s+', dtype='string')
    data = np.asarray(table)[:, 0:-1]

    # '?' marks a missing value; drop every row that has one.
    has_missing_values = (data == '?').any(axis=1)
    data = data[~has_missing_values, :]
    data = data.astype('float')

    domain_ids = data[:, 1]
    x = data
    y = data[:, 0]
    viz_features(x, y, domain_ids)
def create_concrete(transfer=False):
    """Build the concrete regression data set and save it.

    Loads ``concrete/Concrete_Data.csv``; the last column is the target. The
    features are passed through ``array_functions.standardize`` and the
    result is written with ``helper_functions.save_object`` to
    ``concrete_file % t``, where ``t`` is a suffix describing the features.

    Args:
        transfer: When true, each row gets a domain id from its ``age``
            column (1: age < 10, 2: 10 <= age <= 28, 3: age > 75, 0
            otherwise), the last two columns are excluded from the features,
            and the module-level ``concrete_num_feats`` selects either a
            single feature (column 0) or all features. When false, every
            column but the last is a feature and no domain ids are set.

    Raises:
        ValueError: If ``transfer`` is true and ``concrete_num_feats`` is
            neither 1 nor at least the number of feature columns. (Previously
            an ``assert False``, which is stripped under ``-O``.)
    """
    file = 'concrete/Concrete_Data.csv'
    used_field_names, concrete_data = load_csv(file)

    data = data_class.Data()
    t = ''
    if transfer:
        feat_ind = 0
        domain_ind = (used_field_names == 'age').nonzero()[0][0]
        ages = concrete_data[:, domain_ind]
        domain_ids = np.zeros(ages.shape)
        domain_ids[ages < 10] = 1
        domain_ids[(ages >= 10) & (ages <= 28)] = 2
        domain_ids[ages > 75] = 3
        # Features are every column except the last two.
        data.x = concrete_data[:, :-2]
        if concrete_num_feats == 1:
            data.x = array_functions.vec_to_2d(data.x[:, feat_ind])
            t = '-feat=' + str(feat_ind)
        elif concrete_num_feats >= data.x.shape[1]:
            # Inside this guard min(n_columns, concrete_num_feats) is always
            # n_columns, so use it directly.
            t = '-' + str(data.x.shape[1])
        else:
            raise ValueError(
                'unsupported concrete_num_feats: %r' % (concrete_num_feats,)
            )
        data.data_set_ids = domain_ids
    else:
        data.x = concrete_data[:, 0:-1]
    data.y = concrete_data[:, -1]
    data.set_train()
    data.set_target()
    data.set_true_y()
    data.is_regression = True

    # Debug switch: plot the labelled rows instead of saving the data set.
    viz = False
    if viz:
        if not transfer:
            # domain_ids / domain_ind only exist on the transfer path.
            raise ValueError('visualization requires transfer=True')
        to_use = domain_ids > 0
        domain_ids = domain_ids[to_use]
        concrete_data = concrete_data[to_use, :]
        # np.delete returns a new array, so the result must be kept; drop the
        # matching field name as well so names stay aligned with columns.
        concrete_data = np.delete(concrete_data, domain_ind, 1)
        used_field_names = np.delete(used_field_names, domain_ind)
        viz_features(concrete_data, concrete_data[:, -1], domain_ids,
                     used_field_names)
        return

    data.x = array_functions.standardize(data.x)
    s = concrete_file % t
    helper_functions.save_object(s, data)
def create_concrete(transfer=False):
    """Build the concrete regression data set and save it.

    Loads ``concrete/Concrete_Data.csv``; the last column is the target. The
    features are passed through ``array_functions.standardize`` and the
    result is written with ``helper_functions.save_object`` to
    ``concrete_file % t``, where ``t`` is a suffix describing the features.

    Args:
        transfer: When true, each row gets a domain id from its ``age``
            column (1: age < 10, 2: 10 <= age <= 28, 3: age > 75, 0
            otherwise), the last two columns are excluded from the features,
            and the module-level ``concrete_num_feats`` selects either a
            single feature (column 0) or all features. When false, every
            column but the last is a feature and no domain ids are set.

    Raises:
        ValueError: If ``transfer`` is true and ``concrete_num_feats`` is
            neither 1 nor at least the number of feature columns. (Previously
            an ``assert False``, which is stripped under ``-O``.)
    """
    file = "concrete/Concrete_Data.csv"
    used_field_names, concrete_data = load_csv(file)

    data = data_class.Data()
    t = ""
    if transfer:
        feat_ind = 0
        domain_ind = (used_field_names == "age").nonzero()[0][0]
        ages = concrete_data[:, domain_ind]
        domain_ids = np.zeros(ages.shape)
        domain_ids[ages < 10] = 1
        domain_ids[(ages >= 10) & (ages <= 28)] = 2
        domain_ids[ages > 75] = 3
        # Features are every column except the last two.
        data.x = concrete_data[:, :-2]
        if concrete_num_feats == 1:
            data.x = array_functions.vec_to_2d(data.x[:, feat_ind])
            t = "-feat=" + str(feat_ind)
        elif concrete_num_feats >= data.x.shape[1]:
            # Inside this guard min(n_columns, concrete_num_feats) is always
            # n_columns, so use it directly.
            t = "-" + str(data.x.shape[1])
        else:
            raise ValueError(
                "unsupported concrete_num_feats: %r" % (concrete_num_feats,)
            )
        data.data_set_ids = domain_ids
    else:
        data.x = concrete_data[:, 0:-1]
    data.y = concrete_data[:, -1]
    data.set_train()
    data.set_target()
    data.set_true_y()
    data.is_regression = True

    # Debug switch: plot the labelled rows instead of saving the data set.
    viz = False
    if viz:
        if not transfer:
            # domain_ids / domain_ind only exist on the transfer path.
            raise ValueError("visualization requires transfer=True")
        to_use = domain_ids > 0
        domain_ids = domain_ids[to_use]
        concrete_data = concrete_data[to_use, :]
        # np.delete returns a new array, so the result must be kept; drop the
        # matching field name as well so names stay aligned with columns.
        concrete_data = np.delete(concrete_data, domain_ind, 1)
        used_field_names = np.delete(used_field_names, domain_ind)
        viz_features(
            concrete_data, concrete_data[:, -1], domain_ids, used_field_names
        )
        return

    data.x = array_functions.standardize(data.x)
    s = concrete_file % t
    helper_functions.save_object(s, data)
def create_forest_fires():
    """Load ``forest_fires/forestfires.csv``, filter it and plot the features.

    CSV columns 2 and 3 are converted from month / weekday abbreviations to
    integers (jan=1 .. dec=12, sun=1 .. sat=7). The last column is the target
    and the column named ``month`` supplies the domain ids. Only rows whose
    month is 6, 7 or 8 and whose target lies strictly between 0 and 700 are
    kept, and the first four columns are dropped from the features. The
    result is plotted with ``viz_features`` using a ``NadarayaWatsonMethod``
    learner; nothing is returned.
    """
    month_names = (
        "jan", "feb", "mar", "apr", "may", "jun",
        "jul", "aug", "sep", "oct", "nov", "dec",
    )
    day_names = ("sun", "mon", "tue", "wed", "thu", "fri", "sat")
    months = {name: number for number, name in enumerate(month_names, 1)}
    days = {name: number for number, name in enumerate(day_names, 1)}

    def month_to_season(name):
        # Currently the plain month number; a season bucket,
        # (months[x]-1)/3, was used at some point.
        return months[name]

    def day_to_int(name):
        return days[name]

    field_names, forest_data = load_csv(
        "forest_fires/forestfires.csv",
        dtype="float",
        converters={2: month_to_season, 3: day_to_int},
    )

    y = forest_data[:, -1]
    month_column = field_names == "month"
    domain_ids = forest_data[:, month_column]

    # Keep only the selected months (use 1..12 here to keep every month).
    months_to_use = np.asarray([6, 7, 8])
    selected = array_functions.find_set(domain_ids, months_to_use)
    x = forest_data[selected, :]
    y = y[selected]
    domain_ids = domain_ids[selected]

    # Drop the first four columns from the features and their names.
    x = x[:, 4:]
    field_names = field_names[4:]

    # Keep rows whose target is strictly inside (0, 700).
    in_range = (y > 0) & (y < 700)
    x = x[in_range, :]
    y = y[in_range]
    domain_ids = domain_ids[in_range]

    from methods import method

    viz_features(
        x, y, domain_ids, field_names, learner=method.NadarayaWatsonMethod()
    )
def create_bike_sharing():
    """Build a single-feature bike-sharing regression data set and save it.

    Reads ``bike_sharing/day.csv``, keeping CSV column 0 and columns 2-15.
    The ``yr`` column supplies the domain ids and the last kept column is the
    target. Kept columns 8-11 are plotted with ``viz_features``; the one at
    position ``field_to_use`` (1) among those four becomes the only feature.
    The feature is passed through ``array_functions.standardize`` and the
    target through ``array_functions.normalize``. The result is written with
    ``helper_functions.save_object`` to ``bike_file % "-feat=1"``.
    """
    file = "bike_sharing/day.csv"
    # range() is not a list on Python 3, so it must be materialized before
    # concatenation; list(range(...)) works on Python 2 as well.
    columns = [0] + list(range(2, 16))
    all_field_names = pd.read_csv(file, nrows=1, dtype="string")
    all_field_names = np.asarray(all_field_names.keys())
    used_field_names = all_field_names[columns]
    bike_data = np.loadtxt(file, skiprows=1, delimiter=",", usecols=columns)

    domain_ind = used_field_names == "yr"
    domain_ids = np.squeeze(bike_data[:, domain_ind])

    viz = True
    # Indices into the kept columns, not into the raw CSV.
    to_use = np.asarray([8, 9, 10, 11])
    x = bike_data[:, to_use]
    used_field_names = used_field_names[to_use]
    y = bike_data[:, -1]
    if viz:
        # Use ``learner = make_learner()`` to plot with a learner.
        learner = None
        viz_features(x, y, domain_ids, used_field_names, learner=learner)

    field_to_use = 1
    x = x[:, field_to_use]

    data = data_class.Data()
    data.is_regression = True
    data.x = array_functions.vec_to_2d(x)
    data.x = array_functions.standardize(data.x)
    data.y = y
    data.y = array_functions.normalize(data.y)
    data.set_defaults()
    data.data_set_ids = domain_ids

    s = bike_file % ("-feat=" + str(field_to_use))
    helper_functions.save_object(s, data)
def create_bike_sharing():
    """Build a single-feature bike-sharing regression data set and save it.

    Reads ``bike_sharing/day.csv``, keeping CSV column 0 and columns 2-15.
    The ``yr`` column supplies the domain ids and the last kept column is the
    target. Kept columns 8-11 are plotted with ``viz_features``; the one at
    position ``field_to_use`` (1) among those four becomes the only feature.
    The feature is passed through ``array_functions.standardize`` and the
    target through ``array_functions.normalize``. The result is written with
    ``helper_functions.save_object`` to ``bike_file % '-feat=1'``.
    """
    file = 'bike_sharing/day.csv'
    # range() is not a list on Python 3, so it must be materialized before
    # concatenation; list(range(...)) works on Python 2 as well.
    columns = [0] + list(range(2, 16))
    all_field_names = pd.read_csv(file, nrows=1, dtype='string')
    all_field_names = np.asarray(all_field_names.keys())
    used_field_names = all_field_names[columns]
    bike_data = np.loadtxt(file, skiprows=1, delimiter=',', usecols=columns)

    domain_ind = used_field_names == 'yr'
    domain_ids = np.squeeze(bike_data[:, domain_ind])

    viz = True
    # Indices into the kept columns, not into the raw CSV.
    to_use = np.asarray([8, 9, 10, 11])
    x = bike_data[:, to_use]
    used_field_names = used_field_names[to_use]
    y = bike_data[:, -1]
    if viz:
        # Use ``learner = make_learner()`` to plot with a learner.
        learner = None
        viz_features(x, y, domain_ids, used_field_names, learner=learner)

    field_to_use = 1
    x = x[:, field_to_use]

    data = data_class.Data()
    data.is_regression = True
    data.x = array_functions.vec_to_2d(x)
    data.x = array_functions.standardize(data.x)
    data.y = y
    data.y = array_functions.normalize(data.y)
    data.set_defaults()
    data.data_set_ids = domain_ids

    s = bike_file % ('-feat=' + str(field_to_use))
    helper_functions.save_object(s, data)
def create_wine(data_to_create=WINE_RED):
    """Build a wine-quality data set and save it via ``helper_functions.save_object``.

    Both ``wine/winequality-red.csv`` and ``wine/winequality-white.csv`` are
    always loaded (``;``-delimited); field names come from the red file. The
    last CSV column becomes the target ``y``; the remaining columns are passed
    through ``array_functions.standardize`` and become the features ``x``.

    Args:
        data_to_create: ``WINE_RED``, ``WINE_WHITE`` or ``WINE_TRANSFER``.
            For ``WINE_TRANSFER`` the red and white rows are stacked and
            tagged with data-set id 0 (red) / 1 (white), and the features are
            plotted with ``viz_features`` before saving. For the other two
            options ``data_set_ids`` is ``None``.

    Raises:
        ValueError: If ``data_to_create`` is not one of the three options.
            (Previously an ``assert False``, which is stripped under ``-O``.)
    """
    red_file = 'wine/winequality-red.csv'
    white_file = 'wine/winequality-white.csv'
    field_names, red_data = load_csv(red_file, delim=';')
    white_data = load_csv(white_file, delim=';')[1]

    if data_to_create == WINE_TRANSFER:
        # Append a data-set id column: 0 for red rows, 1 for white rows.
        red_ids = np.zeros((red_data.shape[0], 1))
        white_ids = np.ones((white_data.shape[0], 1))
        red_data = np.hstack((red_data, red_ids))
        white_data = np.hstack((white_data, white_ids))
        wine_data = np.vstack((red_data, white_data))

        # After the append, the last column is the id and the one before it
        # is the target.
        ids = wine_data[:, -1]
        x = wine_data[:, :-2]
        y = wine_data[:, -2]
        used_field_names = field_names[:-1]
        viz = True
        if viz:
            # Use ``learner = None`` to plot without a learner.
            learner = make_learner()
            viz_features(x, y, ids, used_field_names, alpha=.01, learner=learner)
        suffix = 'transfer'
    else:
        if data_to_create == WINE_RED:
            wine_data = red_data
            suffix = 'red'
        elif data_to_create == WINE_WHITE:
            wine_data = white_data
            suffix = 'white'
        else:
            raise ValueError('unknown data_to_create: %r' % (data_to_create,))

        ids = None
        x = wine_data[:, :-1]
        y = wine_data[:, -1]

    data = data_class.Data()
    data.x = array_functions.standardize(x)
    data.y = y
    data.set_train()
    data.set_target()
    data.set_true_y()
    data.data_set_ids = ids
    data.is_regression = True

    # Optional sub-sampling, kept disabled as in the original:
    # data = data.rand_sample(.25, data.data_set_ids == 0)
    # data = data.rand_sample(.1, data.data_set_ids == 1)
    # s = wine_file % ('-small-' + str(data.p))
    s = wine_file % ('-' + suffix)
    helper_functions.save_object(s, data)