Example 1
import os

import numpy as np


def preprocessing(problem):

    path = os.path.join('datasets', '{}.csv'.format(problem))
    data = np.genfromtxt(path, delimiter=',')

    inputs = data[:, :-1]
    labels = data[:, -1]

    n_classes = len(np.unique(labels))
    n_dims = inputs.shape[1]

    # one-hot code targets
    if np.min(labels) != 0:
        labels -= 1  # need dummy code to start at zero for this to work
    labels = labels.astype(int)
    labels = np.eye(n_classes)[np.array(labels)]

    # norm data to be between -1 and 1
    if problem[:-1] != 'shj':
        inputs -= np.min(inputs, axis=0)
        inputs /= np.ptp(inputs, axis=0)
        inputs *= 2
        inputs -= 1
        full_set = np.append(inputs, labels, 1)
    else:
        full_set = np.append(inputs, labels, 1)
        full_set = np.concatenate((full_set, full_set),
                                  axis=0)  # to match Nosofsky+ '94

    return [full_set, n_classes, n_dims]
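
A minimal usage sketch; the 'iris' dataset name (expected at datasets/iris.csv) is an assumption:

full_set, n_classes, n_dims = preprocessing('iris')  # hypothetical dataset
print(full_set.shape, n_classes, n_dims)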
Example 2
import numpy as np


def organize_data_from_txt(data_filepath, delimiter=','):
    data = np.genfromtxt(data_filepath, delimiter=delimiter)

    data = {
        'inputs': data[:, :-1],
        'labels': data[:, -1],
        'categories': np.unique(data[:, -1]),
    }

    # map categories to label indices
    data['idx_map'] = {
        category: idx
        for idx, category in enumerate(data['categories'])
    }

    # map original labels to label indices
    data['labels_indexed'] = [
        data['idx_map'][label] for label in data['labels']
    ]

    # generate one hot targets
    data['one_hot_targets'] = np.eye(len(
        data['categories']))[data['labels_indexed']]

    return data
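
A minimal usage sketch; the file name is an assumption:

data = organize_data_from_txt('my_data.csv')  # hypothetical CSV, labels in last column
print(data['one_hot_targets'].shape)  # (n_samples, n_categories)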
Example 3
import numpy as np


def generate_test_and_train(path,
                            min_table_size,
                            max_table_size,
                            train_test_split,
                            excluded_table_sizes=()):
    runtime_data = np.genfromtxt(path, delimiter=',')
    # remove header and operator column
    runtime_data = runtime_data[1:, 0:6]

    # columns kept after slicing:
    # 0: tablerows, 1: tablesizekb, 2: selectivity,
    # 3: tuplewidth, 4: attrInPred, 5: runtime
    # (the original CSV also has an 'operator' column, removed above)
    runtime_data = runtime_data[(runtime_data[:, 1] >= min_table_size)
                                & (runtime_data[:, 1] <= max_table_size)]

    for excluded_table_size in excluded_table_sizes:
        runtime_data = runtime_data[runtime_data[:, 1] != excluded_table_size]

    # usual train test split
    np.random.seed(42)
    np.random.shuffle(runtime_data)

    size_split_point = int(len(runtime_data) * train_test_split)
    test_data = runtime_data[size_split_point:]
    training_data = runtime_data[:size_split_point]

    return training_data, test_data
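
A minimal usage sketch; the file name and all argument values are assumptions:

train, test = generate_test_and_train('runtimes.csv',
                                      min_table_size=100,
                                      max_table_size=10000,
                                      train_test_split=0.8,
                                      excluded_table_sizes=(500,))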
Example 4
import numpy as np
import matplotlib.pyplot as plt


def plotTrainingCurve():
    X = np.genfromtxt(r'dkfTrainTest.csv', delimiter=',')
    t = np.arange(X.shape[0])
    plt.clf()
    plt.plot(t, X)
    # plt.plot(t, X[:, 1])
    # plt.legend(['Train', 'Test'])
    plt.savefig('trainingCurvedkf.jpg')
Example 5
    def load_data(self):
        # load the two-column bacteria dataset into x and y arrays
        data = np.matrix(
            np.genfromtxt(
                '../../mlrefined_datasets/superlearn_datasets/bacteria_data.csv',
                delimiter=','))
        self.x = np.asarray(data[:, 0])
        self.y = np.asarray(data[:, 1])
Example 6
import numpy as np


def get_data():
    # read all inputs and outputs from the CSV file
    all_data = np.genfromtxt("Data_for_UCI_named.csv", delimiter=",")
    data = all_data[1:, :-1]  # drop the header row and the text label column
    n = np.size(data, 0)
    # binarize the numeric stability score in the last remaining column
    for i in range(n):
        if data[i, -1] < 0:
            data[i, -1] = 0
        else:
            data[i, -1] = 1
    return data
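
The row loop above can be collapsed into one vectorized comparison; a minimal sketch under the same column layout (get_data_vectorized is a hypothetical name):

def get_data_vectorized(path="Data_for_UCI_named.csv"):
    all_data = np.genfromtxt(path, delimiter=",")
    data = all_data[1:, :-1]  # drop header row and text label column
    # scores below zero become 0, the rest become 1, with no Python loop
    data[:, -1] = (data[:, -1] >= 0).astype(data.dtype)
    return data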
Example 7
def get_data():
    all_data = np.genfromtxt("Data_for_UCI_named.csv", delimiter=",")
    data = all_data[1:, :-1]
    n = np.size(data, 0)
    # prepend a column of ones to act as the bias term
    bias_ones = np.ones([n, 1])
    data = np.concatenate((bias_ones, data), axis=1)
    # binarize the numeric stability score in the last column
    for i in range(n):
        if data[i, -1] < 0:
            data[i, -1] = 0
        else:
            data[i, -1] = 1
    return data
Example 8
def load_P(P_loc):
    # p_to_H and H_to_p are conversion helpers defined elsewhere in the module
    P_all_inv = np.genfromtxt(P_loc, delimiter=',')

    # the input motion parameters are assumed to come from a coordinate
    # transformation rather than from sampling, so they need to be inverted
    H_all_inv = p_to_H(np.expand_dims(P_all_inv, axis=2))
    H_all = np.linalg.inv(H_all_inv)
    P_all = H_to_p(H_all)

    # split off the first row as the initial parameter vector
    P_init = np.expand_dims(P_all[0, :], axis=0)
    P = P_all[1:, :]

    return P_init, P
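
A minimal usage sketch; the file name is an assumption:

P_init, P = load_P('motion_params.csv')  # hypothetical CSV of motion parameters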
Example 9
def getdata():

    my_data = np.genfromtxt('Data_for_UCI_named.csv', delimiter=',')
    my_data = my_data[1:10001, :]

    mydatasize = np.size(my_data, 0)

    # use the first two thirds of the rows for training
    tdata_size = (2 * mydatasize) // 3

    t_data = my_data[0:tdata_size, :]

    x_star = t_data[:, 0:12]  # the 12 input features

    # np.ceil maps the stability score (which lies in (-1, 1)) to a 0/1 label
    y = np.ceil(t_data[:, 12][:, None])

    shape = x_star.shape
    N = shape[0]

    return x_star, y, N, my_data, tdata_size
Example 10
            bottom = x[j, 2 * i] * w[i] + x[j, 2 * i + 1] * (1 - w[i])
            currout.append(top)
            currout.append(bottom)
        out.append(currout)
    out = np.asarray(out)
    return out


# In[11]:

w_init = scatter_layer_weights(1)
w_init

# In[12]:

X = np.genfromtxt('data/scatter02_T10_all_in.csv', delimiter=',').T
y = np.genfromtxt('data/scatter02_T10_all_out.csv', delimiter=',').T
print(X.shape)
print(y.shape)

# In[25]:


# our "predict" function
def propagate(x, w):
    out = scatter_layer(x, w)
    #print("\nScatter: 1")
    for i in range(9):
        out = prop_layer(out)
        out = scatter_layer(out, w)
        #print("Scatter: " + str(i+2))
Example 11
def load_data(filename, sample=True):
    Y = np.genfromtxt(filename, delimiter=",")
    # z-score each row (the transposes make mean/std run along rows)
    Y = ((Y.T - Y.T.mean(axis=0)) / Y.T.std(axis=0)).T
    # optionally keep only every 100th column to thin the data
    return Y[:, ::100] if sample else Y
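
A minimal usage sketch; the file name is an assumption:

Y_thin = load_data('signals.csv')                # every 100th column kept
Y_full = load_data('signals.csv', sample=False)  # full normalized matrix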
Example 12

# - - - - - - - - - - - - - - - - - -

if __name__ == '__main__':
    import utils

    # data = np.genfromtxt('iris.csv', delimiter = ',')
    # data = np.genfromtxt('mamm.csv', delimiter = ',')
    data = np.genfromtxt('leaf.csv', delimiter=',')

    inputs = data[:, :-1]
    labels = data[:, -1]

    categories = np.unique(labels)
    idx_map = {
        category: idx
        for idx, category in enumerate(categories)
    }
    labels_indexed = [idx_map[label] for label in labels]
    one_hot_targets = np.eye(len(categories))[labels_indexed]

    hps = {
        'lr': .05,  # <-- learning rate
        'wr': [-.1, .1],  # <-- weight range
Example 13

if __name__ == "__main__":
    N = 10000
    M = 12
    learning_rate = 1e-3
    iter_num = 20000

    batch_size = 32

    training_size = int(N * (3 / 3))  # i.e. the whole set is used for training

    X = np.zeros((N, 12))

    csv_file = 'Data_for_UCI_named.csv'
    X_0 = np.genfromtxt(csv_file, delimiter=',')

    X = X_0[1:, 0:12]
    Y = np.zeros(np.shape(X)[0])

    # the last column holds the text label; read it separately as strings
    Y_0 = np.genfromtxt(csv_file, delimiter=',', usecols=(-1),
                        dtype=str)[1:]

    bad_chars = '"'  # strip stray quote characters from the labels
    for i in range(0, np.shape(Y_0)[0]):
        s = Y_0[i]
        for c in bad_chars:
            s = s.replace(c, "")
        if s == "unstable":
            Y[i] = 0
        else:
Example 14

if __name__ == '__main__':
    random = 1

    n_samples = 10
    n_samples_to_test = 100
    num_pseudo_params = 50

    dimensions = [1, 1, 1]
    n_layers = len(dimensions) - 1

    npr.seed(0)  # randomness comes from KMeans
    rs = npr.RandomState(0)

    motor = np.genfromtxt('motor.csv', delimiter=',', skip_header=True)
    X = motor[:, 1]
    X = (X - np.mean(X)) / np.std(X)
    X = X.reshape(len(X), 1)

    y = motor[:, 2]
    y = (y - np.mean(y)) / np.std(y)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42)

    total_num_params, log_likelihood, sample_mean_cov_from_deep_gp, predict_layer_funcs, squared_error, create_deep_map = \
            build_deep_gp(dimensions, rbf_covariance, num_pseudo_params, random)

    init_params = .1 * npr.randn(total_num_params)
    deep_map = create_deep_map(init_params)

    init_params = initialize(deep_map,X,num_pseudo_params)
Example 15
runEstimates = True
computeCounterfactuals = False

data_dir_base = projectFiles + "data/"
results_dir_base = projectFiles + "results/"

dataPath = data_dir_base + size
resultsPath = results_dir_base + size

estimatesPath = resultsPath + "estimates/"
counterfactualsPath = resultsPath + "counterfactuals/"

# Economic Parameters
beta = np.genfromtxt(dataPath + 'beta.csv', delimiter=',')
theta = np.genfromtxt(dataPath + 'theta.csv', delimiter=',')
mu = np.genfromtxt(dataPath + 'mu.csv', delimiter=',')
nu = np.genfromtxt(dataPath + 'nu.csv', delimiter=',')

params = {"beta": beta, "theta": theta, "mu": mu, "nu": nu}

# Data
tau = np.genfromtxt(dataPath + 'tau.csv', delimiter=',')
Xcif = np.genfromtxt(dataPath + 'Xcif.csv', delimiter=',')
Y = np.genfromtxt(dataPath + 'Y.csv', delimiter=',')
Eq = np.genfromtxt(dataPath + 'Eq.csv', delimiter=',')
Ex = np.genfromtxt(dataPath + 'Ex.csv', delimiter=',')
r = np.genfromtxt(dataPath + 'r.csv', delimiter=',')
D = np.genfromtxt(dataPath + 'D.csv', delimiter=',')
ccodes = np.genfromtxt(dataPath + 'ccodes.csv', delimiter=',', dtype="str")
Example 16
    def read_housing_csv(self, file_name, mapping_state, target_name=None):
        data = np.genfromtxt(file_name,
                             delimiter=',',
                             dtype='unicode',
                             skip_header=1)
        if (self.config.NN_DEBUG_SHAPES):
            print(data.shape)
        if (target_name is None):
            skip_cols = 1
        else:
            skip_cols = 2
        # clean up feature-wise
        map_id = 1.0  # don't make a feature irrelevant by mapping it to 0
        # Map known mappings
        mapping_state["NA"] = 0.0
        mapping_state["No"] = 0.0
        mapping_state["N"] = 0.0
        mapping_state["Unf"] = 0.0
        mapping_state["None"] = 0.0
        mapping_state["Po"] = 0.0  # Poor
        mapping_state["Y"] = map_id
        map_id = map_id + 1
        mapping_state["Fa"] = map_id
        map_id = map_id + 1
        mapping_state["TA"] = map_id
        map_id = map_id + 1
        mapping_state["Gd"] = map_id
        map_id = map_id + 1
        mapping_state["Ex"] = map_id
        map_id = map_id + 1

        # Get (sample size x features per sample); Id and Price columns not needed
        X = np.empty((data.shape[0], data.shape[1] - skip_cols))
        # Process column-wise so mappings stay consistent within each feature;
        # otherwise the assigned ids would be effectively random
        for col in range(data.shape[1] - skip_cols):
            for row in range(data.shape[0]):
                try:
                    X[row][col] = data[row][col + 1].astype(float)
                except ValueError:
                    if (data[row][col + 1] in mapping_state):
                        X[row][col] = mapping_state[data[row][col + 1]]
                    else:
                        mapping_state[data[row][col + 1]] = map_id
                        X[row][col] = map_id
                        map_id = map_id + 1.0
        # Get groundtruths
        Y = np.empty((data.shape[0], 1))
        if (target_name is not None):
            for row in range(data.shape[0]):
                col = data.shape[1] - 1
                try:
                    Y[row][0] = data[row][col].astype(float)
                except ValueError:
                    raise Exception("Ground truth should be float")
        # Normalize
        Y_normalize_state = X_normalize_state = None
        if (self.config.NN_NORMALIZE):
            if (target_name is not None):
                Y, Y_normalize_state = self.utils.normalize0(Y, axis=0)
            X, X_normalize_state = self.utils.normalize0(X, axis=0)
        if (self.config.NN_DEBUG_SHAPES):
            print(X.shape, Y.shape, X, X[0][0].dtype)
        return X, X_normalize_state, mapping_state, Y, Y_normalize_state
Example 17

if __name__ == '__main__':
    random = 1

    n_samples = 10
    n_samples_to_test = 100
    num_pseudo_params = 50

    dimensions = [1, 1, 1]
    n_layers = len(dimensions) - 1

    npr.seed(0)  # randomness comes from KMeans
    rs = npr.RandomState(0)

    motor = np.genfromtxt('motor.csv', delimiter=',', skip_header=True)
    X = motor[:, 1]
    X = (X - np.mean(X)) / (np.std(X))
    X = X.reshape(len(X), 1)

    y = motor[:, 2]
    y = (y - np.mean(y)) / (np.std(y))
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.2,
                                                        random_state=42)

    total_num_params, log_likelihood, sample_mean_cov_from_deep_gp, predict_layer_funcs, squared_error, create_deep_map = \
            build_deep_gp(dimensions, rbf_covariance, num_pseudo_params, random)

    init_params = .1 * npr.randn(total_num_params)
Example 18
import autograd.numpy as np
import scipy.optimize
import random
import sys
from autograd import grad

if len(sys.argv) != 5:
    print("args: <trainFile> <trainLabelsFile> <testFile> <testLabelsFile>")
    sys.exit(1)

trainFile = sys.argv[1]
trainLabelsFile = sys.argv[2]
testFile = sys.argv[3]
testLabelsFile = sys.argv[4]

trainData = np.genfromtxt(trainFile, delimiter=',', dtype=np.float64)
trainLabels = np.genfromtxt(trainLabelsFile, delimiter=',', dtype=np.float64)
testData = np.genfromtxt(testFile, delimiter=',', dtype=np.float64)
testLabels = np.genfromtxt(testLabelsFile, delimiter=',', dtype=np.float64)


def f(theta):
    # regularized negative log-likelihood for logistic regression
    objReg = 0.5 / 2.0 * np.dot(theta[1:], theta[1:])  # L2 penalty, bias excluded
    # the exponent is clipped at 300 to avoid overflow in np.exp
    sigmoids = 1.0 / (1.0 + np.exp(
        np.minimum(300.0, -(theta[0] + np.matmul(trainData, theta[1:])))))
    innerSecondTerm = 1.0 - trainLabels + np.multiply(
        sigmoids, (2.0 * trainLabels - 1.0))
    result = np.sum(np.log(innerSecondTerm + 1e-10))  # 1e-10 guards log(0)
    return objReg - result
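
A minimal sketch of minimizing this objective with the modules imported above; the zero initialization and the BFGS method are assumptions:

theta0 = np.zeros(trainData.shape[1] + 1)  # bias plus one weight per feature
res = scipy.optimize.minimize(f, theta0, jac=grad(f), method='BFGS')
theta_hat = res.x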

Example 19
students = 5  # each student receives differently initialized weights
lessons = 20000  # Each lesson consists of the entire training set

iris_data = load_iris()  # load the iris dataset

x = iris_data.data
y_ = iris_data.target.reshape(-1, 1)  # Convert data to a single column

# One Hot encode the class labels
encoder = OneHotEncoder(sparse=False)
y = encoder.fit_transform(y_)

# # Split the data for training and testing
# train_x, test_x, train_y, test_y = train_test_split(x, y, test_size=0.20)

initial_weights = np.genfromtxt(fname='inputs/SF5d_5.dat')
#initial_weights = np.genfromtxt(fname='inputs/keras_inputs.txt')

train_x = np.genfromtxt(fname='same_split/train_x.txt')
train_y = np.genfromtxt(fname='same_split/train_y.txt')
test_x = np.genfromtxt(fname='same_split/test_x.txt')
test_y = np.genfromtxt(fname='same_split/test_y.txt')

# drop the first two columns from each feature matrix
train_x = np.delete(np.delete(train_x, 0, 1), 0, 1)
test_x = np.delete(np.delete(test_x, 0, 1), 0, 1)
x = np.delete(np.delete(x, 0, 1), 0, 1)

# Standardization
train_x[:, 0] = (train_x[:, 0] - np.mean(train_x[:, 0])) / np.std(train_x[:, 0])
train_x[:,
Example 20
def read_data(filename):
    data = np.genfromtxt(filename, dtype=float, delimiter=',', skip_header=1)
    np.random.shuffle(data)
    # return (features without column 0, labels from column 0)
    return np.delete(data, [0], axis=1), data[:, [0]]
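
A minimal usage sketch; the file name is an assumption:

X, y = read_data('labeled_data.csv')  # hypothetical CSV with labels in column 0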
Example 21
    def read_housing_csv_2(self, file_name, x_mapping_state, target_name=None):
        skip_header = 1
        if self.config.NN_MULTI_ENCODE_TEXT_VARS or self.config.NN_APPLY_DATA_SCIENCE:
            skip_header = 0
        data = np.genfromtxt(file_name,
                             delimiter=',',
                             dtype='unicode',
                             skip_header=skip_header)
        if (self.config.NN_DEBUG_SHAPES):
            print(data.shape)
        if (target_name is None):
            skip_cols = 1
        else:
            skip_cols = 2

        # Identify Area columns
        area_cols = [i for i, item in enumerate(data[0, :]) if "Area" in item]

        # multi-encode
        if self.config.NN_MULTI_ENCODE_TEXT_VARS:
            X_data, self.neighborhood_vals = multi_encode_text_variables(
                "Neighborhood", data, self.neighborhood_vals)
            X_data = np.delete(X_data, 0, axis=0)  # Remove header now
            data = X_data
        if self.config.NN_APPLY_DATA_SCIENCE:
            # Apply some data science
            data = self.filter_training_data(data, target_name=target_name)
            X_data = self.augment_training_data(data, target_name=target_name)
            X_data = np.delete(X_data, 0, axis=0)  # Remove header now
            data = X_data
        # Get (sample size x features per sample); Id and Price columns not needed
        X = np.empty((data.shape[0], data.shape[1] - skip_cols))
        for col in range(data.shape[1] - skip_cols):
            map_id = 1.0  # reset every feature
            if (target_name is not None):
                mapping_state = {}
            else:
                mapping_state = x_mapping_state[col]
            if col in area_cols:
                # Direct mapping
                for row in range(data.shape[0]):
                    try:
                        X[row][col] = data[row][col + 1].astype(float)
                    except ValueError:
                        if (data[row][col + 1] in mapping_state):
                            X[row][col] = mapping_state[data[row][col + 1]]
                        else:
                            mapping_state[data[row][col + 1]] = map_id
                            X[row][col] = map_id
                            map_id = map_id + 1.0
            else:
                for row in range(data.shape[0]):
                    if (data[row][col + 1] in mapping_state):
                        X[row][col] = mapping_state[data[row][col + 1]]
                    else:
                        mapping_state[data[row][col + 1]] = map_id
                        X[row][col] = map_id
                        map_id = map_id + 1.0
            x_mapping_state.append(mapping_state)
        # Get groundtruths
        Y = np.empty((data.shape[0], 1))
        if (target_name is not None):
            prev = 0.0
            for row in range(data.shape[0]):
                col = data.shape[1] - 1
                try:
                    Y[row][0] = data[row][col].astype(float)
                except ValueError:
                    raise Exception("Ground truth should be float")
                # Ensure GT was sorted before
                # assert (prev <= Y[row][0])
                prev = Y[row][0]
                if self.config.NN_LOG_TARGET is True:
                    Y[row][0] = np.log(Y[row][0])
        # Normalize
        Y_normalize_state = X_normalize_state = None
        if (self.config.NN_NORMALIZE):
            if (target_name is not None):
                Y, Y_normalize_state = self.utils.normalize0(Y, axis=0)
            X, X_normalize_state = self.utils.normalize0(X, axis=0)
        if (self.config.NN_DEBUG_SHAPES):
            print(X.shape, Y.shape, X, X[0][0].dtype)
        return X, X_normalize_state, x_mapping_state, Y, Y_normalize_state