import numpy as np
import plygdata as pg


def create_abalone_data(validation_data_ratio):
    abalone_path = './SampleData/AbaloneData/abalone.data'
    abalone_data = []
    abalone_classifications = []
    # The abalone sex attribute is categorical (M/F/I) and must be mapped to numbers
    mapping = {
        'M': 1,
        'F': 2,
        'I': 3,
    }
    with open(abalone_path) as file:
        for entry in file.readlines():
            entry_array = entry.strip('\n').split(',')
            abalone_classifications.append(entry_array[-1])
            mapped_array = []
            for index, element in enumerate(entry_array):
                if index == 0:
                    mapped_array.append(mapping[element])
                else:
                    mapped_array.append(element)
            abalone_data.append(mapped_array)

    X_train, Y_train, X_validate, Y_validate = pg.split_data(
        abalone_data, validation_size=validation_data_ratio)
    X_train = X_train.astype(float)
    X_validate = X_validate.astype(float)
    Y_train = Y_train.astype(float)
    Y_validate = Y_validate.astype(float)

    return X_train, Y_train, X_validate, Y_validate
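
A minimal usage sketch for the loader above, assuming the abalone.data file exists at the hard-coded path; the 0.2 validation ratio here is illustrative, not a value from the original:

X_tr, Y_tr, X_val, Y_val = create_abalone_data(0.2)
# Each split is a float array; labels are the ring counts from the last column
print(X_tr.shape, Y_tr.shape, X_val.shape, Y_val.shape)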
Example #2
import plygdata as pg


def get_playground_dataset(Dname, TRAINING_DATA_RATIO=0.8, DATA_NOISE=0.0):
    if Dname == "Circle":
        data = pg.DatasetType.ClassifyCircleData
    elif Dname == "TwoGauss":
        data = pg.DatasetType.ClassifyTwoGaussData
    elif Dname == "Spiral":
        data = pg.DatasetType.ClassifySpiralData
    elif Dname == "XOR":
        data = pg.DatasetType.ClassifyXORData
    else:
        raise ValueError("get_playground_dataset: unknown dataset name '{}'".format(Dname))
    data = pg.generate_data(data, DATA_NOISE)
    data = pg.split_data(data, training_size=TRAINING_DATA_RATIO)
    return data
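
A quick usage sketch; the dataset name and the default 80/20 split are illustrative:

X_train, y_train, X_valid, y_valid = get_playground_dataset("TwoGauss")
print(len(X_train), len(X_valid))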
Example #3
import numpy as np
import plygdata as pg


def create_iris_data(validation_data_ratio):
    iris_path = './SampleData/IrisData/iris.data'
    iris_data = []
    iris_classifications = []
    with open(iris_path) as file:
        for entry in file.readlines():
            entry_array = entry.strip('\n').split(',')
            iris_classifications.append(entry_array[-1])
            iris_data.append(entry_array)
    unique_classifiers = np.unique(iris_classifications)
    classification_mapping = {}
    for index, classifier in enumerate(unique_classifiers):
        classification_mapping[classifier] = index + 1
    X_train, Y_train, X_validate, Y_validate = pg.split_data(iris_data, validation_size=validation_data_ratio)
    X_train = X_train.astype(float)
    X_validate = X_validate.astype(float)
    Y_train_mapped = list(map(lambda classification: classification_mapping[classification[0]], Y_train))
    Y_validate_mapped = list(map(lambda classification: classification_mapping[classification[0]], Y_validate))

    return X_train, Y_train_mapped, X_validate, Y_validate_mapped, classification_mapping
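
A usage sketch for the iris loader, assuming the standard iris.data file at the hard-coded path; the 0.2 ratio is illustrative:

X_tr, Y_tr_mapped, X_val, Y_val_mapped, mapping = create_iris_data(0.2)
# For the standard file this prints something like
# {'Iris-setosa': 1, 'Iris-versicolor': 2, 'Iris-virginica': 3}
print(mapping)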
Example #4
import numpy as np
import plygdata as pg
import matplotlib.pyplot as plt
from pandas import DataFrame

data_noise = 0.0
validation_data_ratio = 0.7

# Generate data
data_array = pg.generate_data(pg.DatasetType.ClassifyCircleData, data_noise)
# data_array = pg.generate_data(pg.DatasetType.ClassifyXORData, data_noise)
# data_array = pg.generate_data(pg.DatasetType.ClassifyTwoGaussData, data_noise)
# data_array = pg.generate_data(pg.DatasetType.ClassifySpiralData, data_noise)
# data_array = pg.generate_data(pg.DatasetType.RegressPlane, data_noise)
# data_array = pg.generate_data(pg.DatasetType.RegressGaussian, data_noise)

# Divide the data for training and validating at a specified ratio
X_train, y_train, X_valid, y_valid = pg.split_data(data_array, validation_size=validation_data_ratio)

# Plot Data
fig, ax = pg.plot_points_with_playground_style(X_train, y_train, X_valid, y_valid, figsize=(6, 6), dpi=100)
plt.show()

train_D = np.concatenate([X_train, y_train], axis=1)
test_D = np.concatenate([X_valid, y_valid], axis=1)

df = DataFrame(train_D)

# export_csv = df.to_csv (r'C:/Users/Koorosh/Desktop/Paper_Joyjit/PlayD_Circle.csv', index = None, header= None)
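
A portable variant of the commented-out export above, writing to a relative path; the filename is just an example:

df.to_csv('PlayD_Circle.csv', index=False, header=False)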
Example #5

import plygdata as pg
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader

PROBLEM_DATA_TYPE = pg.DatasetType.ClassifyCircleData
TRAINING_DATA_RATIO = 0.5
DATA_NOISE = 0.0
LEARNING_RATE = 0.03
REGULARIZATION = 0.03
EPOCHS = 100

data_list = pg.generate_data(PROBLEM_DATA_TYPE, DATA_NOISE)
X_train, y_train, X_valid, y_valid = pg.split_data(
    data_list, training_size=TRAINING_DATA_RATIO)

BATCH_SIZE = 15

t_X_train = torch.from_numpy(X_train).float()
t_y_train = torch.from_numpy(y_train).float()
t_X_valid = torch.from_numpy(X_valid).float()
t_y_valid = torch.from_numpy(y_valid).float()

dataset_train = TensorDataset(t_X_train, t_y_train)
dataset_valid = TensorDataset(t_X_valid, t_y_valid)

loader_train = DataLoader(dataset_train, batch_size=BATCH_SIZE, shuffle=True)
loader_valid = DataLoader(dataset_valid, batch_size=BATCH_SIZE)
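
The constants LEARNING_RATE, REGULARIZATION, and EPOCHS above are not used in the snippet itself, so here is a minimal sketch of the training loop they suggest. The 2-8-1 tanh architecture is an assumption in the spirit of the TensorFlow Playground, and the sign-based accuracy assumes the -1/+1 labels plygdata produces for its classification datasets:

model = nn.Sequential(
    nn.Linear(2, 8),
    nn.Tanh(),
    nn.Linear(8, 1),
    nn.Tanh(),
)
criterion = nn.MSELoss()
# weight_decay applies L2 regularization, standing in for REGULARIZATION
optimizer = optim.SGD(model.parameters(), lr=LEARNING_RATE,
                      weight_decay=REGULARIZATION)

for epoch in range(EPOCHS):
    model.train()
    for xb, yb in loader_train:
        optimizer.zero_grad()
        loss = criterion(model(xb), yb)
        loss.backward()
        optimizer.step()

# Classify by the sign of the output and measure validation accuracy
model.eval()
with torch.no_grad():
    preds = torch.sign(model(t_X_valid))
    accuracy = (preds == t_y_valid).float().mean().item()
print('validation accuracy:', accuracy)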

Example #6
import numpy as np
import plotly as py
import plotly.graph_objects as go
import plygdata as pg
# Point, ClassificationSetN, convert_array_to_array_of_tuples and
# read_data_from_file are project-local helpers (their imports are not shown here).


def classify_data_by_point(path, classifiers, linspace, validation_data_ratio=0.2, generate_graphs=True,
                           surface_name='MADGE Surface', title='Classification Data', filename='Test.html',
                           normalization_standard_deviation_factor=6):
    # Make the lower classifier always the first
    if classifiers[0] > classifiers[1]:
        classifiers = (classifiers[1], classifiers[0])
    # Create our training data params
    training_data_set = convert_array_to_array_of_tuples(read_data_from_file(path))
    # Divide the data into training and validation parts at the specified ratio
    # (split_data also separates the coordinate points from their teacher labels)
    X_train, Y_train, X_validate, Y_validate = pg.split_data(training_data_set, validation_size=validation_data_ratio)

    # This creates the plane with the data we are working with
    new_set = ClassificationSetN()
    x_0_train, y_0_train, z_0_train, x_1_train, y_1_train, z_1_train = [], [], [], [], [], []
    train_label_dim = 2
    train_label_sigma_max = np.zeros(train_label_dim)
    train_label_sigma_min = np.zeros(train_label_dim)
    for [train_x, train_y], classification in zip(X_train, Y_train):
        new_set.add_point(Point(train_x, train_y, classification))
        if classification == classifiers[0]:
            x_0_train.append(train_x)
            y_0_train.append(train_y)
            z_0_train.append(0)
        elif classification == classifiers[1]:
            x_1_train.append(train_x)
            y_1_train.append(train_y)
            z_1_train.append(0)
        # Track the per-dimension minimum and maximum of the training points
        if train_x > train_label_sigma_max[0]:
            train_label_sigma_max[0] = train_x
        if train_x < train_label_sigma_min[0]:
            train_label_sigma_min[0] = train_x
        if train_y > train_label_sigma_max[1]:
            train_label_sigma_max[1] = train_y
        if train_y < train_label_sigma_min[1]:
            train_label_sigma_min[1] = train_y
    new_set.range_vector = np.subtract(train_label_sigma_max, train_label_sigma_min)  # per-dimension range of the data
    new_set.normalization_standard_deviation_factor = normalization_standard_deviation_factor
    new_set.range_vector = np.divide(new_set.range_vector, new_set.normalization_standard_deviation_factor)
    if generate_graphs:
        # https://jakevdp.github.io/PythonDataScienceHandbook/04.12-three-dimensional-plotting.html
        x_space = np.linspace(linspace[0], linspace[1], linspace[2])
        y_space = np.linspace(linspace[0], linspace[1], linspace[2])
        X, Y = np.meshgrid(x_space, y_space)
        Z = []
        for x_array, y_array in zip(X, Y):
            z_point = []
            for x_point, y_point in zip(x_array, y_array):
                predicted_point = np.round(new_set.calculate_madge_data_and_map_to_point(Point(x_point, y_point), normalize=True))
                if np.absolute(classifiers[0] - predicted_point) > np.absolute(classifiers[1] - predicted_point):
                    z_point.append(classifiers[1])
                else:
                    z_point.append(classifiers[0])
            Z.append(z_point)
        Z = np.array(Z)

    # This creates the testing data graph data
    x_0_test, y_0_test, z_0_test, x_1_test, y_1_test, z_1_test, x_test, y_test, z_test = \
        [], [], [], [], [], [], [], [], []
    for [test_x, test_y], classification in zip(X_validate, Y_validate):
        x_test.append(test_x)
        y_test.append(test_y)
        z_test.append(np.round(new_set.calculate_madge_data_and_map_to_point(Point(test_x, test_y), normalize=True)))
        if classification == classifiers[0]:
            x_0_test.append(test_x)
            y_0_test.append(test_y)
            z_0_test.append(0)
        elif classification == classifiers[1]:
            x_1_test.append(test_x)
            y_1_test.append(test_y)
            z_1_test.append(0)

    correct_results = 0
    for result, test_result in zip(z_test, Y_validate):
        if result == test_result:
            correct_results += 1
    accuracy = np.divide(correct_results, len(z_test))
    if generate_graphs:
        # Now we generate graphs
        trace_surface = go.Surface(x=X, y=Y, z=Z, name=surface_name, showscale=False)
        trace_scatter_class_a_training = go.Scatter3d(x=x_0_train, y=y_0_train, z=z_0_train, mode='markers',
                                                      name='Training Classifier {}'.format(classifiers[0]),
                                                      marker=dict(
                                                          size=3,
                                                          color='#f29938',
                                                          opacity=1
                                                      ), legendgroup="Group_Train")
        trace_scatter_class_b_training = go.Scatter3d(x=x_1_train, y=y_1_train, z=z_1_train, mode='markers',
                                                      name='Training Classifier {}'.format(classifiers[1]),
                                                      marker=dict(
                                                          size=3,
                                                          color='#257ec0',
                                                          opacity=1
                                                      ), legendgroup="Group_Train")
        # Add plots for testing data
        trace_scatter_class_a_testing = go.Scatter3d(x=x_0_test, y=y_0_test, z=z_0_test, mode='markers',
                                                     name='Testing Classifier {}'.format(classifiers[0]),
                                                     marker=dict(
                                                         size=3,
                                                         color='#f29938',
                                                         opacity=0.4
                                                     ), legendgroup="Group_Test")

        trace_scatter_class_b_testing = go.Scatter3d(x=x_1_test, y=y_1_test, z=z_1_test, mode='markers',
                                                     name='Testing Classifier {}'.format(classifiers[1]),
                                                     marker=dict(
                                                         size=3,
                                                         color='#257ec0',
                                                         opacity=0.4
                                                     ), legendgroup="Group_Test")

        # Append the accuracy to the title (plotly titles render HTML, so use <br> for the line break)
        title = title + "<br>Accuracy: {}".format(accuracy)
        data = [trace_surface,
                trace_scatter_class_a_training, trace_scatter_class_b_training,
                trace_scatter_class_a_testing, trace_scatter_class_b_testing]
        fig = go.Figure(data=data)
        fig.update_layout(title=title, autosize=True,
                          width=700, height=700,
                          margin=dict(l=50, r=50, b=65, t=90))
        py.offline.plot(fig, filename=filename)
    else:
        # If we're not generating graphs we will return the accuracy
        return accuracy
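
A hypothetical invocation of the function above; the data file path, the (-1, 1) classifier labels, and the linspace grid are all assumptions:

accuracy = classify_data_by_point('./SampleData/two_class_points.txt', (-1, 1),
                                  linspace=(-6, 6, 50), generate_graphs=False)
print('accuracy:', accuracy)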
Example #7
import numpy as np
import plotly as py
import plotly.graph_objects as go
import plygdata as pg
from scipy import interpolate
# Point, ClassificationSet, convert_array_to_array_of_tuples and
# read_data_from_file are project-local helpers (their imports are not shown here).


def classify_data(path, classifiers, linspace, validation_data_ratio=0.2, generate_graphs=True,
                  surface_name='MADGE Surface', title='Classification Data', filename='Test.html'):
    # Make the lower classifier always the first
    if classifiers[0] > classifiers[1]:
        classifiers = (classifiers[1], classifiers[0])
    # Create our training data params
    training_data_set = convert_array_to_array_of_tuples(read_data_from_file(path))
    # Divide the data into training and validation parts at the specified ratio
    # (split_data also separates the coordinate points from their teacher labels)
    X_train, Y_train, X_validate, Y_validate = pg.split_data(training_data_set, validation_size=validation_data_ratio)

    # This creates the plane with the data we are working with
    new_set = ClassificationSet(sigma=1)
    x_0_train, y_0_train, z_0_train, x_1_train, y_1_train, z_1_train = [], [], [], [], [], []
    for [train_x, train_y], classification in zip(X_train, Y_train):
        new_set.add_point(Point(train_x, train_y, classification))
        if classification == classifiers[0]:
            x_0_train.append(train_x)
            y_0_train.append(train_y)
            z_0_train.append(0)
        elif classification == classifiers[1]:
            x_1_train.append(train_x)
            y_1_train.append(train_y)
            z_1_train.append(0)
    x_space = np.linspace(linspace[0], linspace[1], linspace[2])
    y_space = np.linspace(linspace[0], linspace[1], linspace[2])
    X, Y = np.meshgrid(x_space, y_space)
    Z = new_set.calculate_madge_data_and_map_to_plane(X, Y)
    
    ## This creates the testing data graph data
    x_0_test, y_0_test, z_0_test, x_1_test, y_1_test, z_1_test, x_test, y_test = [], [], [], [], [], [], [], []
    for [test_x, test_y], classification in zip(X_validate, Y_validate):
        x_test.append(test_x)
        y_test.append(test_y)
        if classification == classifiers[0]:
            x_0_test.append(test_x)
            y_0_test.append(test_y)
            z_0_test.append(0)
        elif classification == classifiers[1]:
            x_1_test.append(test_x)
            y_1_test.append(test_y)
            z_1_test.append(0)
    # Output an accuracy
    # This will be done via interpolation of the graph
    # We will create an interp function given an x,y array, and output the interpolated vector
    # Input vector will be [X_validation, Y_validation]
    # Our interp function will be the new_set.calculate_madge_data_and_map_to_plane function
    # TODO: evaluate whether cubic-spline 2-D interpolation would be a better fit here.
    # See https://stackoverflow.com/questions/37872171/how-can-i-perform-two-dimensional-interpolation-using-scipy
    # RBF (radial basis function) interpolation is used because it handles scattered 2-D data well in practice.
    f_training_interpolate = interpolate.Rbf(X, Y, Z, function='cubic', smooth=0)
    Z_validate_interpolate = f_training_interpolate(x_test, y_test)
    
    # If the interpolated z value is above 0, the point is classified as the greater of
    # the two classifiers; otherwise as the lesser. These cutoffs are arbitrary.
    # We compare the results against Y_validate to compute an accuracy.
    def compare_with_zero(value):
        if value > 0:
            return classifiers[1]
        else:
            return classifiers[0]
    test_classification_results = list(map(compare_with_zero, Z_validate_interpolate))
    
    correct_results = 0
    for result, test_result in zip(test_classification_results, Y_validate):
        if result == test_result:
            correct_results += 1
    accuracy = np.divide(correct_results, len(test_classification_results))
    if generate_graphs:
        # Now we generate graphs 
        trace_surface = go.Surface(x=X, y=Y, z=Z, name=surface_name, showscale=False)
        trace_scatter_class_a_training = go.Scatter3d(x=x_0_train, y=y_0_train, z=z_0_train, mode='markers',
                                             name='Training Classifier {}'.format(classifiers[0]), 
                                             marker=dict(
                                                 size=3,
                                                 color='#f29938',    
                                                 opacity=1
                                             ), legendgroup="Group_Train")
        trace_scatter_class_b_training = go.Scatter3d(x=x_1_train, y=y_1_train, z=z_1_train, mode='markers',
                                             name='Training Classifier {}'.format(classifiers[1]),
                                             marker=dict(
                                                 size=3,
                                                 color='#257ec0',    
                                                 opacity=1
                                             ), legendgroup="Group_Train")
        # Add plots for testing data
        trace_scatter_class_a_testing = go.Scatter3d(x=x_0_test, y=y_0_test, z=z_0_test, mode='markers',
                                             name='Testing Classifier {}'.format(classifiers[0]),
                                             marker=dict(
                                                 size=3,
                                                 color='#f29938',    
                                                 opacity=0.4
                                             ), legendgroup="Group_Test")
                                        
        trace_scatter_class_b_testing = go.Scatter3d(x=x_1_test, y=y_1_test, z=z_1_test, mode='markers',
                                             name='Testing Classifier {}'.format(classifiers[1]),
                                             marker=dict(
                                                 size=3,
                                                 color='#257ec0',    
                                                 opacity=0.4
                                             ), legendgroup="Group_Test")
        
        # Append the accuracy to the title (plotly titles render HTML, so use <br> for the line break)
        title = title + "<br>Accuracy: {}".format(accuracy)
        data = [trace_surface, 
                trace_scatter_class_a_training, trace_scatter_class_b_training, 
                trace_scatter_class_a_testing, trace_scatter_class_b_testing]
        fig = go.Figure(data=data)
        fig.update_layout(title=title, autosize=True,
                          width=700, height=700,
                          margin=dict(l=50, r=50, b=65, t=90))
        py.offline.plot(fig, filename=filename)
    else:
        # If we're not generating graphs we will return the accuracy
        return accuracy
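
A hypothetical invocation of this variant as well; the file path, classifier labels, and grid are assumptions:

accuracy = classify_data('./SampleData/two_class_points.txt', (-1, 1),
                         linspace=(-6, 6, 50), generate_graphs=False)
print('accuracy:', accuracy)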