def red_wine_quality(test_size=0.2):
    """Load the red wine quality CSV and pass it to ``data_util.normalize_split``.

    The file is semicolon-separated and its first row is skipped.
    Columns 0-10 are used as features and column 11 as the target.

    Args:
        test_size: Forwarded unchanged to ``data_util.normalize_split``.

    Returns:
        Whatever ``data_util.normalize_split(x, y, test_size)`` returns.
    """
    table = np.loadtxt(
        "data/regression/wine_quality/winequality-red.csv",
        delimiter=';',
        skiprows=1,
    )
    features = table[:, :11]
    target = table[:, 11]
    return data_util.normalize_split(features, target, test_size)
def facebook_metric(test_size=0.2):
    """Load the Facebook metrics CSV and pass it to ``data_util.normalize_split``.

    The file is semicolon-separated and its first row is skipped. Column 1
    is run through the ``post_type`` converter before parsing. Columns 0-17
    are used as features and column 18 as the target.

    Args:
        test_size: Forwarded unchanged to ``data_util.normalize_split``.

    Returns:
        Whatever ``data_util.normalize_split(x, y, test_size)`` returns.
    """
    column_converters = {1: post_type}
    table = np.loadtxt(
        "data/regression/facebook_metrics/dataset_Facebook.csv",
        delimiter=';',
        skiprows=1,
        converters=column_converters,
    )
    features = table[:, :18]
    target = table[:, 18]
    return data_util.normalize_split(features, target, test_size)
def breast_cancer(test_size=0.2):
    """Load the ``wdbc.data`` file and pass it to ``data_util.normalize_split``.

    The file is comma-separated and read from its first row. Column 1 is
    run through the ``cancer_type_num`` converter and used as the target;
    columns 2-31 are used as features. Column 0 is not used.

    Args:
        test_size: Forwarded unchanged to ``data_util.normalize_split``.

    Returns:
        Whatever ``data_util.normalize_split(x, y, test_size)`` returns.
    """
    column_converters = {1: cancer_type_num}
    table = np.loadtxt(
        "data/classification/breast_cancer/wdbc.data",
        delimiter=',',
        skiprows=0,
        converters=column_converters,
    )
    features = table[:, 2:32]
    target = table[:, 1]
    return data_util.normalize_split(features, target, test_size)
def community_crime(test_size=0.2):
    """Load ``communities.data`` and pass it to ``data_util.normalize_split``.

    Of the 128 columns in the comma-separated file, columns 0-4, 101-117
    and 121-126 are left out. Of the 100 columns that remain, the first 99
    are used as features and the last one (file column 127) as the target.

    Args:
        test_size: Forwarded unchanged to ``data_util.normalize_split``.

    Returns:
        Whatever ``data_util.normalize_split(x, y, test_size)`` returns.
    """
    excluded = (
        set(range(0, 5))
        | set(range(101, 118))
        | set(range(121, 127))
    )
    # loadtxt returns columns in the order given by ``usecols``, so hand it
    # an explicitly ascending list. A bare set has no guaranteed iteration
    # order, which would make the feature/target slicing below fragile.
    used_cols = [col for col in range(128) if col not in excluded]
    data = np.loadtxt(
        "data/regression/communities.data",
        delimiter=',',
        usecols=used_cols,
    )
    x, y = data[:, :99], data[:, 99]
    return data_util.normalize_split(x, y, test_size)
def gpu(test_size=0.2):
    """Load the SGEMM product CSV and pass it to ``data_util.normalize_split``.

    Columns 14-17 are collapsed into one value per row, which is stored in
    a new ``average`` column and then transformed with ``log10``. Note that
    despite the column name, the statistic computed is the row-wise median.
    Columns 0-13 are used as features and column 18 as the target.

    Args:
        test_size: Forwarded unchanged to ``data_util.normalize_split``.

    Returns:
        Whatever ``data_util.normalize_split(x, y, test_size)`` returns.
    """
    frame = pd.read_csv('data/regression/gpu/sgemm_product.csv')

    # Combine the run columns into a single per-row value (median), then
    # take the logarithm -- the original comment attributes the log
    # transform to the data set readme.
    run_columns = frame.iloc[:, 14:18]
    frame['average'] = run_columns.median(axis=1).apply(np.log10)

    features = frame.iloc[:, :14]
    # Assumes the CSV has exactly 18 columns, so the appended 'average'
    # column sits at position 18 -- TODO confirm.
    target = frame.iloc[:, 18]
    return data_util.normalize_split(features, target, test_size)
def yeast(test_size=0.2):
    """Load ``yeast.data`` and pass it to ``data_util.normalize_split``.

    The file is whitespace-separated and read without a header into ten
    columns labelled 0-9. Columns 1-8 are used as features. Column 9 holds
    the class labels, which are mapped to numbers with
    ``preprocessing.OrdinalEncoder`` and used as the target. Column 0 is
    not used.

    Args:
        test_size: Forwarded unchanged to ``data_util.normalize_split``.

    Returns:
        Whatever ``data_util.normalize_split(x, y, test_size)`` returns.
    """
    # Raw string: '\s' is not a valid escape sequence in a normal literal.
    data = pd.read_csv(
        'data/classification/yeast/yeast.data',
        delimiter=r'\s+',
        names=range(10),
    )
    x = data.iloc[:, 1:9]

    # Encode target classes as numbers. Select the label column explicitly
    # (kept 2-D, as the encoder expects) instead of relying on dtype
    # inference to yield exactly two object columns in a fixed order.
    labels = data.iloc[:, [9]]
    enc = preprocessing.OrdinalEncoder()
    enc.fit(labels)
    y = enc.transform(labels)
    return data_util.normalize_split(x, y[:, 0], test_size)
def seismic_bumps(test_size=0.2):
    """Load ``seismic-bumps.arff`` and pass it to ``data_util.normalize_split``.

    The first 155 lines of the file are skipped and the rest is parsed as
    comma-separated values. Columns 0, 1 and 7 go through the
    ``seismic_level`` converter and column 2 through the ``shift``
    converter. Columns 0-17 are used as features and column 18 as the
    target; both are cast to ``int``.

    Args:
        test_size: Forwarded unchanged to ``data_util.normalize_split``.

    Returns:
        Whatever ``data_util.normalize_split(x, y, test_size)`` returns.
    """
    column_converters = {
        0: seismic_level,
        1: seismic_level,
        2: shift,
        7: seismic_level,
    }
    table = np.loadtxt(
        "data/classification/seismic-bumps.arff",
        delimiter=',',
        skiprows=155,
        converters=column_converters,
    )
    features = table[:, :18].astype(int)
    target = table[:, 18].astype(int)
    return data_util.normalize_split(features, target, test_size)
def _load_molecular_set(csv_path, npz_path):
    """Read one molecular CSV, store it as ``npz_path`` and return the stored arrays.

    The header line is only used to count the columns. Column 1 becomes
    the target vector and columns 2 onward become the ``uint8`` feature
    matrix. Both arrays are written with ``np.savez`` (keys ``arr_0`` and
    ``arr_1``) and then read back from that archive.
    """
    with open(csv_path) as f:
        n_cols = len(f.readline().rstrip('\n').split(','))
    X = np.loadtxt(csv_path, delimiter=',', usecols=range(2, n_cols),
                   skiprows=1, dtype=np.uint8)
    y = np.loadtxt(csv_path, delimiter=',', usecols=[1], skiprows=1)
    np.savez(npz_path, X, y)
    # NpzFile keeps the archive open until closed; the context manager
    # releases the handle once both arrays have been read into memory.
    with np.load(npz_path) as archive:
        return archive['arr_0'], archive['arr_1']


def molecular(test_size=0.2):
    """Load the ACT4 and ACT2 training sets and split each one.

    Each CSV is parsed, written to ``act4.npz`` / ``act2.npz`` in the
    current working directory, and read back from that archive before
    being passed to ``data_util.normalize_split`` (ACT4 first, then ACT2).

    Args:
        test_size: Forwarded unchanged to ``data_util.normalize_split``.

    Returns:
        An 8-tuple ``(x4_train, x4_test, y4_train, y4_test, x2_train,
        x2_test, y2_train, y2_test)`` built from the two
        ``data_util.normalize_split`` results.
    """
    x4, y4 = _load_molecular_set(
        'data/regression/molecular/ACT4_competition_training.csv',
        'act4.npz')
    x2, y2 = _load_molecular_set(
        'data/regression/molecular/ACT2_competition_training.csv',
        'act2.npz')

    x4_train, x4_test, y4_train, y4_test = data_util.normalize_split(
        x4, y4, test_size)
    x2_train, x2_test, y2_train, y2_test = data_util.normalize_split(
        x2, y2, test_size)
    return (x4_train, x4_test, y4_train, y4_test,
            x2_train, x2_test, y2_train, y2_test)
def plates(test_size=0.2):
    """Load ``Faults.NNA`` and pass it to ``data_util.normalize_split``.

    The file is tab-separated. Columns 0-26 are used as features. The
    remaining columns (27 onward) are reduced to a single label vector by
    taking the column index of every non-zero entry.

    Args:
        test_size: Forwarded unchanged to ``data_util.normalize_split``.

    Returns:
        Whatever ``data_util.normalize_split(x, y, test_size)`` returns.
    """
    table = np.loadtxt('data/classification/Plates/Faults.NNA',
                       delimiter='\t')
    features = table[:, :27]
    indicators = table[:, 27:]
    # Simple coding of the target: the column position of each non-zero
    # entry. Presumably every row has exactly one non-zero indicator so the
    # labels line up with the feature rows -- TODO confirm.
    class_index = np.nonzero(indicators)[1]
    return data_util.normalize_split(features, class_index, test_size)
def QSAR(test_size=0.2):
    """Load the QSAR aquatic toxicity CSV and pass it to ``data_util.normalize_split``.

    The file is semicolon-separated and read from its first row.
    Columns 0-8 are used as features and column 9 as the target.

    Args:
        test_size: Forwarded unchanged to ``data_util.normalize_split``.

    Returns:
        Whatever ``data_util.normalize_split(x, y, test_size)`` returns.
    """
    table = np.loadtxt(
        "data/regression/qsar_aquatic_toxicity.csv",
        delimiter=';',
    )
    features = table[:, :9]
    target = table[:, 9]
    return data_util.normalize_split(features, target, test_size)
def concrete(test_size=0.2):
    """Load the concrete Excel sheet and pass it to ``data_util.normalize_split``.

    Columns 0-7 of the sheet are used as features and column 8 as the
    target.

    Args:
        test_size: Forwarded unchanged to ``data_util.normalize_split``.

    Returns:
        Whatever ``data_util.normalize_split(x, y, test_size)`` returns.
    """
    frame = pd.read_excel('data/regression/concrete/Concrete_Data.xls')
    features = frame.iloc[:, :8]
    target = frame.iloc[:, 8]
    return data_util.normalize_split(features, target, test_size)
def credit_card_client(test_size=0.2):
    """Load the credit card clients CSV and pass it to ``data_util.normalize_split``.

    The file is comma-separated and its first two rows are skipped.
    Columns 1-23 are used as features and column 24 as the target.
    Column 0 is not used.

    Args:
        test_size: Forwarded unchanged to ``data_util.normalize_split``.

    Returns:
        Whatever ``data_util.normalize_split(x, y, test_size)`` returns.
    """
    table = np.loadtxt(
        "data/classification/default_credit_card_clients.csv",
        delimiter=',',
        skiprows=2,
    )
    features = table[:, 1:24]
    target = table[:, 24]
    return data_util.normalize_split(features, target, test_size)
def statlog_australian(test_size=0.2):
    """Load ``australian.dat`` and pass it to ``data_util.normalize_split``.

    The file is whitespace-separated (``np.loadtxt`` default) and read
    from its first row. Columns 0-13 are used as features and column 14
    as the target.

    Args:
        test_size: Forwarded unchanged to ``data_util.normalize_split``.

    Returns:
        Whatever ``data_util.normalize_split(x, y, test_size)`` returns.
    """
    table = np.loadtxt("data/classification/australian.dat", skiprows=0)
    features = table[:, :14]
    target = table[:, 14]
    return data_util.normalize_split(features, target, test_size)
def statlog_german(test_size=0.2):
    """Load ``german.data-numeric`` and pass it to ``data_util.normalize_split``.

    The file is whitespace-separated (``np.loadtxt`` default) and read
    from its first row. Columns 0-23 are used as features and column 24
    as the target; both are cast to ``int``.

    Args:
        test_size: Forwarded unchanged to ``data_util.normalize_split``.

    Returns:
        Whatever ``data_util.normalize_split(x, y, test_size)`` returns.
    """
    table = np.loadtxt("data/classification/german.data-numeric",
                       skiprows=0)
    features = table[:, :24].astype(int)
    target = table[:, 24].astype(int)
    return data_util.normalize_split(features, target, test_size)
def diabetic_retinopathy(test_size=0.2):
    """Load ``messidor_features.arff`` and pass it to ``data_util.normalize_split``.

    The first 24 lines of the file are skipped and the rest is parsed as
    comma-separated values. Columns 0-17 are used as features and column
    18 as the target.

    Args:
        test_size: Forwarded unchanged to ``data_util.normalize_split``.

    Returns:
        Whatever ``data_util.normalize_split(x, y, test_size)`` returns.
    """
    table = np.loadtxt(
        "data/classification/messidor_features.arff",
        delimiter=',',
        skiprows=24,
    )
    features = table[:, :18]
    target = table[:, 18]
    return data_util.normalize_split(features, target, test_size)