import os

import numpy as np
import pandas as pd
from scipy.io import arff
from sklearn import datasets
from sklearn.cluster import KMeans
from sklearn.datasets import fetch_mldata  # removed in scikit-learn >= 0.22
from sklearn.manifold import TSNE
from sklearn.mixture import GaussianMixture as GMM
from sklearn.model_selection import train_test_split

# save_dataset, get_abspath and split_data are project helpers assumed to be
# defined elsewhere in this repo.


def get_splits(X, y, dname, filepath='data/experiments'):
    """Splits X and y into training and test sets and saves them as CSVs.

    Args:
        X (Numpy.Array): Attributes.
        y (Numpy.Array): Classes.
        dname (str): Dataset name.
        filepath (str): Output folder.

    """
    # get train and test splits, stratified to preserve class balance
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=0, stratify=y)

    # combine attributes and labels into single train/test frames
    np_train = np.concatenate((X_train, y_train[:, np.newaxis]), axis=1)
    train = pd.DataFrame(np_train)
    np_test = np.concatenate((X_test, y_test[:, np.newaxis]), axis=1)
    test = pd.DataFrame(np_test)

    # save datasets to CSV in the caller-specified output folder
    trainfile = '{}_train.csv'.format(dname)
    testfile = '{}_test.csv'.format(dname)
    save_dataset(train, trainfile, subdir=filepath, header=False)
    save_dataset(test, testfile, subdir=filepath, header=False)
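# Illustrative usage sketch, not part of the original pipeline: the input
# file is an assumption and presumes preprocess_winequality() has already
# written data/experiments/winequality.csv.
def _example_get_splits():
    df = pd.read_csv(get_abspath('winequality.csv', 'data/experiments'))
    X = df.drop(columns='class').values
    y = df['class'].values
    get_splits(X, y, 'winequality')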
def preprocess_winequality():
    """Cleans and generates the wine quality dataset for experiments as a
    CSV file.

    """
    # get file paths
    sdir = 'data/winequality'
    tdir = 'data/experiments'
    wr_file = get_abspath('winequality-red.csv', sdir)
    ww_file = get_abspath('winequality-white.csv', sdir)

    # load as data frames
    wine_red = pd.read_csv(wr_file, sep=';')
    wine_white = pd.read_csv(ww_file, sep=';')

    # encode artificial label indicating whether a wine is red or not
    wine_red['red'] = 1
    wine_white['red'] = 0

    # combine datasets and format column names (DataFrame.append is
    # deprecated in recent pandas, so pd.concat is used instead)
    df = pd.concat([wine_red, wine_white], ignore_index=True)
    df.columns = ['_'.join(col.split(' ')) for col in df.columns]
    df.rename(columns={'quality': 'class'}, inplace=True)

    # save to CSV
    save_dataset(df, 'winequality.csv', sep=',', subdir=tdir)
def get_cluster_data(X, y, name, km_k, gmm_k, rdir, perplexity=30):
    """Generates a 2D dataset that contains cluster labels for K-Means and
    GMM, as well as the class labels for the given dataset.

    Args:
        X (Numpy.Array): Attributes.
        y (Numpy.Array): Labels.
        name (str): Dataset name.
        km_k (int): Number of clusters for K-Means.
        gmm_k (int): Number of components for GMM.
        rdir (str): Folder to save results CSV.
        perplexity (int): Perplexity parameter for t-SNE.

    """
    # project X down to two dimensions with t-SNE
    X2D = TSNE(n_iter=5000, perplexity=perplexity).fit_transform(X)

    # get cluster labels using the best k found for each algorithm
    km = KMeans(random_state=0).set_params(n_clusters=km_k)
    gmm = GMM(random_state=0).set_params(n_components=gmm_k)
    km_cl = np.atleast_2d(km.fit(X2D).labels_).T
    gmm_cl = np.atleast_2d(gmm.fit(X2D).predict(X2D)).T
    y = np.atleast_2d(y).T

    # create concatenated dataset
    cols = ['x1', 'x2', 'km', 'gmm', 'class']
    df = pd.DataFrame(np.hstack((X2D, km_cl, gmm_cl, y)), columns=cols)

    # save as CSV
    filename = '{}_2D.csv'.format(name)
    save_dataset(df, filename, sep=',', subdir=rdir, header=True)
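# Illustrative usage sketch, not part of the original pipeline: the k values
# and output folder are assumptions, and the input file presumes
# preprocess_winequality() has already been run.
def _example_cluster_data():
    df = pd.read_csv(get_abspath('winequality.csv', 'data/experiments'))
    X = df.drop(columns='class').values
    y = df['class'].values
    get_cluster_data(X, y, 'winequality', km_k=4, gmm_k=4,
                     rdir='results/clustering')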
def preprocess_seismic():
    """Cleans and generates the seismic bumps dataset for experiments as a
    CSV file. Uses one-hot encoding for categorical features.

    """
    # get file paths
    sdir = 'data/seismic-bumps'
    tdir = 'data/experiments'
    seismic_file = get_abspath('seismic-bumps.arff', sdir)

    # read ARFF file and convert to data frame
    rawdata = arff.loadarff(seismic_file)
    df = pd.DataFrame(rawdata[0])

    # apply one-hot encoding to categorical features using pandas get_dummies
    cat_cols = ['seismic', 'seismoacoustic', 'shift', 'ghazard']
    cats = df[cat_cols]
    onehot_cols = pd.get_dummies(cats, prefix=cat_cols)

    # drop original categorical columns and append one-hot encoded columns
    df.drop(columns=cat_cols, inplace=True)
    df = pd.concat((df, onehot_cols), axis=1)

    # drop columns that have only one unique value (such features add no
    # information)
    for col in df.columns:
        if len(np.unique(df[col])) == 1:
            df.drop(columns=col, inplace=True)

    # drop columns with low correlation with the class and high (over 0.8)
    # correlation with other attributes
    df.drop(columns=['gdenergy', 'maxenergy'], inplace=True)

    # save to CSV
    save_dataset(df, 'seismic-bumps.csv', sep=',', subdir=tdir)
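# Sketch of the correlation screen behind the gdenergy/maxenergy drop above.
# This is an assumption about how those columns were chosen, not code from
# the original pipeline: flag attributes whose correlation with the class
# (assumed numeric here) is lower than their strongest (> 0.8) correlation
# with another attribute.
def _correlated_columns(df, class_col='class', threshold=0.8):
    corr = df.corr().abs()
    flagged = []
    for col in corr.columns.drop(class_col):
        others = corr[col].drop([col, class_col])
        if others.max() > threshold and corr.loc[col, class_col] < others.max():
            flagged.append(col)
    return flagged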
def combine_datasets(dfs):
    """Creates a combined dataset for error and accuracy to compare various
    optimization algorithms and saves it as a CSV file.

    Args:
        dfs (dict(Pandas.DataFrame)): Data frames keyed by algorithm name
            ('BP', 'RHC', 'SA', 'GA').

    """
    # rename columns so each algorithm's metrics remain distinct after the
    # frames are concatenated side by side
    bCols = {
        'MSE_train': 'bp_msetrain',
        'MSE_test': 'bp_msetest',
        'MSE_validation': 'bp_msevalid',
        'acc_train': 'bp_acctrain',
        'acc_test': 'bp_acctest',
        'acc_validation': 'bp_accvalid',
        'seconds_elapsed': 'bp_time'
    }
    rCols = {
        'MSE_train': 'rhc_msetrain',
        'MSE_test': 'rhc_msetest',
        'MSE_validation': 'rhc_msevalid',
        'acc_train': 'rhc_acctrain',
        'acc_test': 'rhc_acctest',
        'acc_validation': 'rhc_accvalid',
        'seconds_elapsed': 'rhc_time'
    }
    sCols = {
        'MSE_train': 'sa_msetrain',
        'MSE_test': 'sa_msetest',
        'MSE_validation': 'sa_msevalid',
        'acc_train': 'sa_acctrain',
        'acc_test': 'sa_acctest',
        'acc_validation': 'sa_accvalid',
        'seconds_elapsed': 'sa_time'
    }
    gCols = {
        'MSE_train': 'ga_msetrain',
        'MSE_test': 'ga_msetest',
        'MSE_validation': 'ga_msevalid',
        'acc_train': 'ga_acctrain',
        'acc_test': 'ga_acctest',
        'acc_validation': 'ga_accvalid',
        'seconds_elapsed': 'ga_time'
    }
    BP = dfs['BP'].rename(index=str, columns=bCols)
    RHC = dfs['RHC'].drop(columns='iteration').rename(index=str, columns=rCols)
    SA = dfs['SA'].drop(columns='iteration').rename(index=str, columns=sCols)
    GA = dfs['GA'].drop(columns='iteration').rename(index=str, columns=gCols)

    # create combined dataset and save it
    res = pd.concat([BP, RHC, SA, GA], axis=1)
    save_dataset(res, filename='combined.csv', subdir='results/NN/combined')
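# Illustrative usage sketch: the per-algorithm results file names below are
# an assumed naming scheme, not one confirmed by this module.
def _example_combine():
    dfs = {name: pd.read_csv(get_abspath('{}_results.csv'.format(name),
                                         'results/NN'))
           for name in ('BP', 'RHC', 'SA', 'GA')}
    combine_datasets(dfs)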
def preprocess_digits():
    """Loads the scikit-learn digits dataset, shuffles it, and saves it as a
    CSV file for experiments.

    """
    tdir = 'data/experiments'  # forward slashes avoid the invalid '\e' escape
    digits = datasets.load_digits()
    X = digits.data
    y = digits.target
    data = pd.DataFrame(X)
    data['class'] = pd.DataFrame(y)

    # shuffle rows and reset the index
    data = data.sample(frac=1).reset_index(drop=True)
    print("Number of samples: %d" % len(data))
    save_dataset(data, 'digits.csv', sep=',', subdir=tdir)
def preprocess_abalone():
    """Cleans and generates the abalone dataset for experiments as a CSV
    file. Uses one-hot encoding for categorical features.

    """
    sdir = 'data/abalone'
    tdir = 'data/experiments'
    abalone_file = get_abspath('abalone.csv', sdir)
    column_names = ['sex', 'length', 'diameter', 'height', 'whole weight',
                    'shucked weight', 'viscera weight', 'shell weight',
                    'rings']
    data = pd.read_csv(abalone_file, names=column_names)
    print('Number of samples (Abalone): %d' % len(data))

    # one-hot encode the 'sex' column with pandas get_dummies, as in
    # preprocess_seismic (OneHotEncoder's categorical_features argument was
    # removed in scikit-learn 0.22); sorted categories F, I, M map to
    # female, infant, male
    onehot = pd.get_dummies(data['sex'])
    onehot.columns = ['female', 'infant', 'male']
    data = pd.concat((onehot, data.drop(columns='sex')), axis=1)

    # drop columns that have only one unique value (no information)
    for col in data.columns:
        if len(np.unique(data[col])) == 1:
            data.drop(columns=col, inplace=True)

    # bin rings into 3 classes: < 7 -> 0, 7-13 -> 1, > 13 -> 2. Assigning
    # through iterrows() only mutates a copy of each row, so the binning is
    # vectorized with np.select instead
    rings = data['rings']
    data['rings'] = np.select([rings < 7, rings <= 13], [0, 1], default=2)

    # data.rename(columns={'rings': 'class'}, inplace=True)
    # save_dataset(data, 'abalone-2.csv', sep=',', subdir=tdir)
    # for 30 classes comment out when using 3 classes
    save_dataset(data, 'abalone.csv', sep=',', subdir=tdir)  # for 3 classes
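# Equivalent binning with pd.cut, shown as a design alternative to the
# np.select call above; it uses the same 7/13 boundaries (ring counts are
# integers, so the (-inf, 6] interval matches rings < 7).
def _bin_rings(rings):
    return pd.cut(rings, bins=[-np.inf, 6, 13, np.inf],
                  labels=[0, 1, 2]).astype(int)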
def get_splits(X, y, filepath='data/experiments'):
    """Splits X and y into training, validation, and test sets and saves
    them as CSVs. Note: this seismic-specific variant shadows the earlier
    get_splits definition if both live in the same module.

    Args:
        X (Numpy.Array): Attributes.
        y (Numpy.Array): Classes.
        filepath (str): Output folder.

    Returns:
        X_train, X_test, y_train, y_test (Numpy.Array): Split datasets.

    """
    # get train and test splits
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=0, stratify=y)

    # split out a validation set from the training data (emulates
    # cross-validation)
    X_train, X_val, y_train, y_val = train_test_split(
        X_train, y_train, test_size=0.2, random_state=0, stratify=y_train)

    # combine attributes and labels into single frames
    np_train = np.concatenate((X_train, y_train[:, np.newaxis]), axis=1)
    train = pd.DataFrame(np_train)
    np_test = np.concatenate((X_test, y_test[:, np.newaxis]), axis=1)
    test = pd.DataFrame(np_test)
    np_val = np.concatenate((X_val, y_val[:, np.newaxis]), axis=1)
    validation = pd.DataFrame(np_val)

    # save datasets to CSV in the caller-specified output folder
    save_dataset(train, 'seismic_train.csv', subdir=filepath, header=False)
    save_dataset(test, 'seismic_test.csv', subdir=filepath, header=False)
    save_dataset(validation, 'seismic_validation.csv', subdir=filepath,
                 header=False)
    return X_train, X_test, y_train, y_test


if __name__ == '__main__':
    print('Processing \n')
    tdir = 'data'

    # MNIST: keep classes 1, 3 and 5, sampling at most 1000 rows per class
    # (fetch_mldata is removed in scikit-learn >= 0.22; fetch_openml is its
    # modern replacement)
    mldata_dir = os.path.join(os.getcwd(), os.pardir, tdir)
    mnist = fetch_mldata('MNIST original', data_home=mldata_dir)
    y = pd.Series(mnist.target).astype('int')
    X = pd.DataFrame(mnist.data)
    X.loc[:, 'class'] = y
    X = (X.loc[X['class'].isin([1, 3, 5])]
         .groupby('class', group_keys=False)
         .apply(lambda x: x.sample(min(len(x), 1000))))
    save_dataset(X, 'digits.csv', sep=',', subdir=tdir)

    # contraceptive method choice dataset from the UCI repository
    df = pd.read_csv(
        'http://archive.ics.uci.edu/ml/machine-learning-databases/cmc/cmc.data',  # noqa
        header=None)
    df.rename(columns={9: 'class'}, inplace=True)
    save_dataset(df, 'contraceptive.csv', sep=',', subdir=tdir)

    # abalone dataset from the UCI repository
    df = pd.read_csv(
        'http://archive.ics.uci.edu/ml/machine-learning-databases/abalone/abalone.data',  # noqa
        header=None)

    # map ring counts to three classes; the branch between the two returns
    # is missing from the source
    def map_values(value):
        if value < 9:
            return 0
        return 2


if __name__ == '__main__':
    print('Processing \n')
    tdir = 'data'

    # split the abalone dataset (split_data is a project helper assumed to
    # be in scope) and save the train/test frames
    df_abalone = pd.read_csv('../data/abalone.csv')
    X_train, X_test, y_train, y_test = split_data(df_abalone)
    train_df = pd.DataFrame(np.append(X_train, y_train.reshape(-1, 1), 1))
    test_df = pd.DataFrame(np.append(X_test, y_test.reshape(-1, 1), 1))
    print('SHAPE OF test')
    print(test_df.shape)
    print('SHAPE OF Train')
    print(train_df.shape)
    save_dataset(train_df, 'abalone_train.csv', sep=',', subdir=tdir,
                 header=False)
    save_dataset(test_df, 'abalone_test.csv', sep=',', subdir=tdir,
                 header=False)