Example #1
def get_splits(X, y, dname, filepath='data/experiments'):
    """ Splits X and y datasets into training, validation, and test data sets
    and then saves them as CSVs

    Args:
        X (Numpy.Array): Attributes.
        y (Numpy.Array): Classes.
        dname (str): Dataset name.
        filepath (str): Output folder.

    """
    # get train and test splits
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=0, stratify=y)

    # combine datasets
    np_train = np.concatenate((X_train, y_train[:, np.newaxis]), axis=1)
    train = pd.DataFrame(np_train)

    np_test = np.concatenate((X_test, y_test[:, np.newaxis]), axis=1)
    test = pd.DataFrame(np_test)

    # save datasets to CSV
    trainfile = '{}_train.csv'.format(dname)
    testfile = '{}_test.csv'.format(dname)
    save_dataset(train, trainfile, subdir=filepath, header=False)
    save_dataset(test, testfile, subdir=filepath, header=False)
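
All of the examples on this page call two project helpers, get_abspath and save_dataset, that are not shown. A minimal sketch of what they presumably do, with signatures inferred from the call sites (both are assumptions, not the project's actual code), so the snippets can run standalone:

import os

def get_abspath(filename, filepath):
    # resolve filename inside the given project subfolder
    return os.path.join(os.path.abspath(filepath), filename)

def save_dataset(df, filename, sep=',', subdir='data', header=True):
    # write a DataFrame to CSV under subdir, creating the folder if needed
    outdir = os.path.abspath(subdir)
    os.makedirs(outdir, exist_ok=True)
    df.to_csv(os.path.join(outdir, filename), sep=sep, header=header,
              index=False)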
Example #2
def preprocess_winequality():
    """Cleans and generates wine quality dataset for experiments as a
    CSV file.

    """
    # get file paths
    sdir = 'data/winequality'
    tdir = 'data/experiments'
    wr_file = get_abspath('winequality-red.csv', sdir)
    ww_file = get_abspath('winequality-white.csv', sdir)

    # load as data frame
    wine_red = pd.read_csv(wr_file, sep=';')
    wine_white = pd.read_csv(ww_file, sep=';')

    # encode artificial label to determine if wine is red or not
    wine_red['red'] = 1
    wine_white['red'] = 0

    # combine datasets and format column names
    df = pd.concat([wine_red, wine_white], ignore_index=True)
    df.columns = ['_'.join(col.split(' ')) for col in df.columns]
    df.rename(columns={'quality': 'class'}, inplace=True)

    # save to CSV
    save_dataset(df, 'winequality.csv', sep=',', subdir=tdir)
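
The list-comprehension column cleanup simply replaces spaces with underscores (e.g. 'fixed acidity' becomes 'fixed_acidity'); an equivalent, slightly more direct spelling:

df.columns = [col.replace(' ', '_') for col in df.columns]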
Example #3
def get_cluster_data(X, y, name, km_k, gmm_k, rdir, perplexity=30):
    """Generates 2D dataset that contains cluster labels for K-Means and GMM,
    as well as the class labels for the given dataset.

    Args:
        X (Numpy.Array): Attributes.
        y (Numpy.Array): Labels.
        name (str): Dataset name.
        km_k (int): Number of clusters for K-Means.
        gmm_k (int): Number of components for GMM.
        rdir (str): Folder to save results CSV.
        perplexity (int): Perplexity parameter for t-SNE.

    """
    # generate 2D X dataset
    X2D = TSNE(n_iter=5000, perplexity=perplexity).fit_transform(X)

    # get cluster labels using best k
    km = KMeans(random_state=0).set_params(n_clusters=km_k)
    gmm = GMM(random_state=0).set_params(n_components=gmm_k)
    km_cl = np.atleast_2d(km.fit(X2D).labels_).T
    gmm_cl = np.atleast_2d(gmm.fit(X2D).predict(X2D)).T
    y = np.atleast_2d(y).T

    # create concatenated dataset
    cols = ['x1', 'x2', 'km', 'gmm', 'class']
    df = pd.DataFrame(np.hstack((X2D, km_cl, gmm_cl, y)), columns=cols)

    # save as CSV
    filename = '{}_2D.csv'.format(name)
    save_dataset(df, filename, sep=',', subdir=rdir, header=True)
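
A hedged usage sketch: the imports and the results folder below are assumptions, with GMM presumably an alias for scikit-learn's GaussianMixture.

from sklearn.cluster import KMeans
from sklearn.datasets import load_iris
from sklearn.manifold import TSNE
from sklearn.mixture import GaussianMixture as GMM

iris = load_iris()
get_cluster_data(iris.data, iris.target, 'iris', km_k=3, gmm_k=3,
                 rdir='results/clustering')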
Example #4
def preprocess_seismic():
    """Cleans and generates seismic bumps dataset for experiments as a
    CSV file. Uses one-hot encoding for categorical features.

    """
    # get file path
    sdir = 'data/seismic-bumps'
    tdir = 'data/experiments'
    seismic_file = get_abspath('seismic-bumps.arff', sdir)

    # read arff file and convert to record array
    rawdata = arff.loadarff(seismic_file)
    df = pd.DataFrame(rawdata[0])

    # apply one-hot encoding to categorical features using Pandas get_dummies
    cat_cols = ['seismic', 'seismoacoustic', 'shift', 'ghazard']
    cats = df[cat_cols]
    onehot_cols = pd.get_dummies(cats, prefix=cat_cols)

    # drop original categorical columns and append one-hot encoded columns
    df.drop(columns=cat_cols, inplace=True)
    df = pd.concat((df, onehot_cols), axis=1)

    # drop columns that have only 1 unique value (features add no information)
    for col in df.columns:
        if len(np.unique(df[col])) == 1:
            df.drop(columns=col, inplace=True)

    # drop columns with low correlation with class and higher (over 0.8)
    # correlation with other attributes
    df.drop(columns=['gdenergy', 'maxenergy'], inplace=True)

    # save to CSV
    save_dataset(df, 'seismic-bumps.csv', sep=',', subdir=tdir)
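
For reference, pd.get_dummies expands each categorical column into one indicator column per level (0/1 integers in older pandas, booleans in 2.x):

cats = pd.DataFrame({'shift': ['W', 'N', 'W']})
print(pd.get_dummies(cats, prefix=['shift']))
#    shift_N  shift_W
# 0        0        1
# 1        1        0
# 2        0        1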
Example #5
def combine_datasets(df):
    """Creates a combined dataset for error and accuracy to compare various
    optimization algorithms and saves it as a CSV file.

    Args:
        df (dict of Pandas.DataFrame): Data frames keyed by algorithm name.

    """

    # rename columns
    bCols = {
        'MSE_train': 'bp_msetrain',
        'MSE_test': 'bp_msetest',
        'MSE_validation': 'bp_msevalid',
        'acc_train': 'bp_acctrain',
        'acc_test': 'bp_acctest',
        'acc_validation': 'bp_accvalid',
        'seconds_elapsed': 'bp_time'
    }
    rCols = {
        'MSE_train': 'rhc_msetrain',
        'MSE_test': 'rhc_msetest',
        'MSE_validation': 'rhc_msevalid',
        'acc_train': 'rhc_acctrain',
        'acc_test': 'rhc_acctest',
        'acc_validation': 'rhc_accvalid',
        'seconds_elapsed': 'rhc_time'
    }
    sCols = {
        'MSE_train': 'sa_msetrain',
        'MSE_test': 'sa_msetest',
        'MSE_validation': 'sa_msevalid',
        'acc_train': 'sa_acctrain',
        'acc_test': 'sa_acctest',
        'acc_validation': 'sa_accvalid',
        'seconds_elapsed': 'sa_time'
    }
    gCols = {
        'MSE_train': 'ga_msetrain',
        'MSE_test': 'ga_msetest',
        'MSE_validation': 'ga_msevalid',
        'acc_train': 'ga_acctrain',
        'acc_test': 'ga_acctest',
        'acc_validation': 'ga_accvalid',
        'seconds_elapsed': 'ga_time'
    }

    BP = df['BP'].rename(index=str, columns=bCols)
    RHC = df['RHC'].drop(columns='iteration').rename(index=str, columns=rCols)
    SA = df['SA'].drop(columns='iteration').rename(index=str, columns=sCols)
    GA = df['GA'].drop(columns='iteration').rename(index=str, columns=gCols)

    # create combined datasets
    res = pd.concat([BP, RHC, SA, GA], axis=1)
    save_dataset(res, filename='combined.csv', subdir='results/NN/combined')
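
A hedged usage sketch; the per-algorithm result paths are hypothetical stand-ins for wherever the training curves were saved:

dfs = {name: pd.read_csv('results/NN/{}_results.csv'.format(name))
       for name in ['BP', 'RHC', 'SA', 'GA']}
combine_datasets(dfs)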
Example #6
def preprocess_digits():
    """Shuffles the scikit-learn digits dataset and saves it as a CSV file
    for experiments.

    """
    tdir = 'data/experiments'
    digits = datasets.load_digits()
    X = digits.data
    y = digits.target
    data = pd.DataFrame(X)
    data['class'] = y
    data = data.sample(frac=1).reset_index(drop=True)
    print("Number of samples: %d" % len(data))
    save_dataset(data, 'digits.csv', sep=',', subdir=tdir)
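
For a reproducible shuffle, sample accepts a seed:

data = data.sample(frac=1, random_state=0).reset_index(drop=True)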
Example #7
def preprocess_abalone():
    """Cleans and generates abalone dataset for experiments as a
    CSV file. Uses one-hot encoding for categorical features.
    """

    sdir = 'data/abalone'
    tdir = 'data/experiments'
    abalone_file = get_abspath('abalone.csv', sdir)

    column_names = ["sex", "length", "diameter", "height", "whole weight",
                    "shucked weight", "viscera weight", "shell weight",
                    "rings"]
    data = pd.read_csv(abalone_file, names=column_names)
    print("Number of samples (Abalone): %d" % len(data))

    # one-hot encode the categorical 'sex' column (F/I/M) with pd.get_dummies
    # (the deprecated OneHotEncoder(categorical_features=[0]) route is gone
    # from recent scikit-learn), keeping the indicator columns first to match
    # the original column order
    dummies = pd.get_dummies(data['sex']).astype(int)
    dummies.columns = ['female', 'infant', 'male']
    data = pd.concat([dummies, data.drop(columns='sex')], axis=1)

    # drop columns that have only 1 unique value (features add no information)
    for col in data.columns:
        if len(np.unique(data[col])) == 1:
            data.drop(columns=col, inplace=True)

    # FOR 3 CLASSES: bin ring counts into 0 (< 7), 1 (7-13), and 2 (> 13).
    # Note: assigning through iterrows() rows mutates copies, not the frame,
    # so the binning is done with a vectorized np.select instead.
    conditions = [data['rings'] < 7, data['rings'] <= 13]
    data['rings'] = np.select(conditions, [0, 1], default=2)

    data.rename(columns={'rings': 'class'}, inplace=True)
    # for 30 classes, skip the binning above and save as 'abalone-2.csv'
    save_dataset(data, 'abalone.csv', sep=',', subdir=tdir)  # for 3 classes
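
An equivalent, arguably clearer way to express the three-class binning is pd.cut:

data['rings'] = pd.cut(data['rings'], bins=[-np.inf, 6, 13, np.inf],
                       labels=[0, 1, 2]).astype(int)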
Example #8
def get_splits(X, y, filepath='data/experiments'):
    """ Splits X and y datasets into training, validation, and test data sets
    and then saves them as CSVs

    Args:
        X (Numpy.Array): Attributes.
        y (Numpy.Array): Classes.
        filepath (str): Output folder.

    """
    # get train and test splits
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.3,
                                                        random_state=0,
                                                        stratify=y)

    # split out validation dataset (emulates cross-validation)
    X_train, X_val, y_train, y_val = train_test_split(X_train,
                                                      y_train,
                                                      test_size=0.2,
                                                      random_state=0,
                                                      stratify=y_train)

    # combine datasets
    np_train = np.concatenate((X_train, y_train[:, np.newaxis]), axis=1)
    train = pd.DataFrame(np_train)

    np_test = np.concatenate((X_test, y_test[:, np.newaxis]), axis=1)
    test = pd.DataFrame(np_test)

    np_val = np.concatenate((X_val, y_val[:, np.newaxis]), axis=1)
    validation = pd.DataFrame(np_val)

    # save datasets to CSV
    save_dataset(train, 'seismic_train.csv', subdir=filepath, header=False)
    save_dataset(test, 'seismic_test.csv', subdir=filepath, header=False)
    save_dataset(validation,
                 'seismic_validation.csv',
                 subdir=filepath,
                 header=False)
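
A hedged usage sketch, with X and y as NumPy arrays. Note that with test_size=0.3 followed by test_size=0.2 on the remaining 70%, the final proportions are 56% train, 14% validation, and 30% test:

from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

X, y = make_classification(n_samples=1000, random_state=0)
get_splits(X, y, filepath='data/experiments')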
Example #9
    return X_train, X_test, y_train, y_test


if __name__ == '__main__':
    print('Processing \n')
    tdir = 'data'
    mldata_dir = os.path.join(os.getcwd(), os.pardir, tdir)
    # fetch_mldata was removed from scikit-learn (mldata.org is gone);
    # fetch_openml is the replacement source for MNIST
    mnist = fetch_openml('mnist_784', version=1, data_home=mldata_dir)
    y = pd.Series(mnist.target).astype('int')
    X = pd.DataFrame(mnist.data)
    X.loc[:, 'class'] = y
    X = (X.loc[X['class'].isin([1, 3, 5])]).\
        groupby('class', group_keys=False).\
        apply(lambda x: x.sample(min(len(x), 1000)))

    save_dataset(X, 'digits.csv', sep=',', subdir=tdir)

    df = pd.read_csv(
        'http://archive.ics.uci.edu/ml/machine-learning-databases/cmc/cmc.data',  # noqa
        header=None)

    df.rename(columns={9: 'class'}, inplace=True)
    save_dataset(df, 'contraceptive.csv', sep=',', subdir=tdir)

    df = pd.read_csv(
        'http://archive.ics.uci.edu/ml/machine-learning-databases/abalone/abalone.data',  # noqa
        header=None)

    def map_values(value):
        if value < 9:
            return 0
Example #10
    return 2


if __name__ == '__main__':
    print('Processing \n')
    tdir = 'data'

    df_abalone = pd.read_csv('../data/abalone.csv')

    X_train, X_test, y_train, y_test = split_data(df_abalone)

    train_df = pd.DataFrame(np.append(X_train, y_train.reshape(-1, 1), 1))
    test_df = pd.DataFrame(np.append(X_test, y_test.reshape(-1, 1), 1))

    print('Test shape:')
    print(test_df.shape)

    print('Train shape:')
    print(train_df.shape)

    save_dataset(train_df,
                 'abalone_train.csv',
                 sep=',',
                 subdir=tdir,
                 header=False)
    save_dataset(test_df,
                 'abalone_test.csv',
                 sep=',',
                 subdir=tdir,
                 header=False)
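
split_data is not shown in this snippet; presumably it separates the class column and performs a train/test split, along these lines (a hypothetical sketch):

from sklearn.model_selection import train_test_split

def split_data(df):
    # assumes the last column holds the class label
    X = df.iloc[:, :-1].values
    y = df.iloc[:, -1].values
    return train_test_split(X, y, test_size=0.3, random_state=0, stratify=y)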