Example #1
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler


def init_data():
    # read_data() is defined elsewhere in the project and returns a DataFrame.
    df = read_data()
    X = df.loc[:, df.columns != 'defaultPaymentNextMonth'].values
    y = df.loc[:, df.columns == 'defaultPaymentNextMonth'].values

    # One-hot encode the categorical columns (indices 2 and 3); all other
    # columns are passed through unchanged.
    onehotencoder = OneHotEncoder(categories="auto", sparse=False)
    X = ColumnTransformer(
        [("", onehotencoder, [2, 3])],
        remainder="passthrough"
    ).fit_transform(X)

    # Scale the features; with_mean=False skips centering.
    scaler = StandardScaler(with_mean=False)
    X = scaler.fit_transform(X)

    return X, y
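A minimal usage sketch for the function above (the train/test split is illustrative and not part of the original example; scikit-learn is assumed):

from sklearn.model_selection import train_test_split

X, y = init_data()
# Hold out 20% of the rows for testing; ravel() flattens y to one dimension.
X_train, X_test, y_train, y_test = train_test_split(X, y.ravel(), test_size=0.2)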
Example #2
def main():
    np.random.seed(1)
    plt.style.use("bmh")

    dh.generate_data_for_first_func()
    dh.generate_data_for_second_func()

    data = dh.read_data()

    first_label = 'First FFN'
    print(first_label)
    bool_func(data[0], num_on_hidden=4, num_epochs=80, label=first_label)

    second_label = 'Second FFN'
    print(second_label)
    bool_func(data[1], num_on_hidden=8, num_epochs=50, label=second_label)

    plt.show()
Example #3
def main():

    # Read data and split into X and y
    y, X = read_data(s.data_file_path)

    # Fit a linear regression via maximum likelihood estimation
    lm_res = mlel.fitLinearRegression(y, X)

    # Show the regression summary
    print(lm_res.summary())
    
    # Estimate predicted values
    y_hat = mlel.yhat(X, lm_res)

    # Plot y versus predicted y
    p.plot(y, y_hat)

    # Compute the L1 error
    l1 = mlel.compute_L1(y, y_hat)
    print('L1 error: ', l1[0])
    
    #Compute error between y and y_hat
    error = mlel.error_list(y, y_hat)
    
    # Plot the prediction errors
    p.plot_error(error)
    
    # Bootstrap to obtain parameter replicates
    bs_params = bootstrapping.bstrap(s.number_replication, y, X)
    
    # Get means and lower/upper confidence bounds
    means, lower_bounds, upper_bounds = bootstrapping.compute_CI(bs_params)
    
    print('Lower bounds: ', lower_bounds)
    print('Upper bounds: ', upper_bounds)

    # Plot confidence intervals
    p.plotCI(np.asarray(bs_params), lower_bounds, upper_bounds)
    
    # Cluster with a Gaussian mixture model
    gmm_pred = clustering.gmm_cluster(X, s.n_components)

    # Classification report
    print(classification_report(y, gmm_pred, target_names=s.target_names))
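For reference, the percentile-style confidence intervals returned by bootstrapping.compute_CI can be sketched as follows (a minimal illustration; the function name and the 95% level are assumptions, not the module's actual code):

import numpy as np

def compute_ci_sketch(bs_params, alpha=0.05):
    # bs_params: (n_replications, n_params) array of bootstrap estimates.
    params = np.asarray(bs_params)
    means = params.mean(axis=0)
    lower_bounds = np.percentile(params, 100 * alpha / 2, axis=0)
    upper_bounds = np.percentile(params, 100 * (1 - alpha / 2), axis=0)
    return means, lower_bounds, upper_bounds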
Example #4
def load_data(name="corpus", force_refresh=0) -> object:
    data_path = "data"
    output_path = "/content/drive/My Drive/Colab Notebooks/INF8460/Project/output"

    result = ()
    if name == "corpus":
        result = read_data(
            os.path.join(
                data_path,
                "/content/drive/My Drive/Colab Notebooks/INF8460/Project/data/corpus.csv"
            ))

    elif name == "train":
        result = read_questions(
            os.path.join(
                data_path,
                "/content/drive/My Drive/Colab Notebooks/INF8460/Project/data/train_ids.csv"
            ))

    elif name == "validation":
        result = read_questions(
            os.path.join(
                data_path,
                "/content/drive/My Drive/Colab Notebooks/INF8460/Project/data/val_ids.csv"
            ))

    elif name == "test":
        result = read_questions(
            os.path.join(
                data_path,
                "/content/drive/My Drive/Colab Notebooks/INF8460/Project/data/test.csv"
            ))

    else:
        print("Unknown dataset name:", name)

    return result
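A quick illustration of the os.path.join behavior noted in the comment above; this is standard-library semantics, not project-specific code:

import os.path

# When a later component is absolute, all earlier components are discarded.
print(os.path.join("data", "/content/drive/corpus.csv"))  # /content/drive/corpus.csv
print(os.path.join("data", "corpus.csv"))                 # data/corpus.csv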
Example #5


def generate_heatmap(accuracy, x_range, y_range):
    sbr.heatmap(pd.DataFrame(accuracy), annot=True, cmap="viridis", fmt='g')
    plt.title('Grid-search for logistic regression')
    plt.ylabel('Learning rate: $\\eta$')
    plt.xlabel('Regularization Term: $\\lambda$')
    plt.xticks(ticks=np.arange(len(x_range)) + 0.5, labels=x_range)
    plt.yticks(ticks=np.arange(len(y_range)) + 0.5, labels=y_range)
    plt.show()



if __name__ == '__main__':
    df = read_data(filtered=True)
    prediction_target = 'defaultPaymentNextMonth'


    features = df.loc[:, df.columns != prediction_target].values
    targets = df.loc[:, df.columns == prediction_target].values
    design_matrix = create_design_matrix(features)
    data_train, data_test, targets_train, targets_test = train_test_split(
        design_matrix, targets, test_size=0.2, shuffle=True)


    search_start, search_end, n_points = -6, 1, 8

    learning_rates = np.logspace(search_start, search_end, n_points)
    lambda_values = np.logspace(search_start, search_end, n_points)
    iterations = 10000
    accuracy = np.zeros((len(learning_rates), len(lambda_values)))
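The snippet is cut off at this point. The loop that fills accuracy would look roughly like the sketch below, where clf_factory stands in for the project's own logistic-regression class (its fit/score interface is an assumption):

# Hypothetical grid-search loop over learning rates and regularization strengths.
for i, eta in enumerate(learning_rates):
    for j, lam in enumerate(lambda_values):
        clf = clf_factory(learning_rate=eta, regularization=lam, n_iter=iterations)
        clf.fit(data_train, targets_train)
        accuracy[i, j] = clf.score(data_test, targets_test)

generate_heatmap(accuracy, lambda_values, learning_rates)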
Example #6
train_file = args[0]
test_file = args[1]

filename = train_file
seq_len = 1014  # Fixed length of a sequence of chars, given
num_classes = 14  # Num of categories/concepts, given
init_step_size = 0.01  # Given
max_epochs = 33  # Num of epochs training happens for - arbitrarily set to 33 to observe step size decay
mini_batch_size = 1  # Given value is 128, but I've set to 1 to run quickly on toy data
momentum = 0.9  # Given
alphabet = "abcdefghijklmnopqrstuvwxyz0123456789-,;.!?:'\"/\\|_@#$%^&*~`+-=<>()[]{}"  #alphabet set, given
alph_size = len(alphabet)
step_size = init_step_size
data = data_handling.read_data(filename, alphabet, seq_len, num_classes)
x = data[0]  # Training input character sequences
y = data[1]  # Training input labels


# Step size decay: halve the step size every 3 epochs (between epochs 1 and 30)
def step_size_decay(epoch):
    global step_size
    if 1 < epoch <= 30 and epoch % 3 == 1:
        step_size = step_size / 2
    return step_size


# Callback to print epoch count, loss and step size (to observe decay) after every epoch
class FlushCallback(Callback):
    def on_epoch_end(self, epoch, logs={}):
        # Body reconstructed from the comment above (the original is cut off here):
        # report progress and flush so output appears immediately.
        print("Epoch:", epoch + 1, "Loss:", logs.get("loss"),
              "Step size:", step_size, flush=True)
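A schedule function with step_size_decay's signature is typically attached to training through Keras's LearningRateScheduler callback; a minimal sketch (the model and the fit call are assumed from the part of the script that was cut off):

from keras.callbacks import LearningRateScheduler

# Keras calls step_size_decay(epoch) each epoch and uses the returned value
# as the learning rate for that epoch.
lr_schedule = LearningRateScheduler(step_size_decay)
# model.fit(x, y, batch_size=mini_batch_size, epochs=max_epochs,
#           callbacks=[lr_schedule, FlushCallback()])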
Example #7
import random
import sys

mini_batch_size = 256
learning_rate = 0.001
standard_deviation = 0.01
initial_bias = 0.01
loss = "SCE"  # There are two types of loss: L2 or SCE i.e. Sigmoid Cross Entropy
verbosity_level = False

network_structure_list = [
    dh.MNIST_WIDTH * dh.MNIST_HEIGHT,  # Input layer size
    32,
    dh.MNIST_WIDTH * dh.MNIST_HEIGHT  # Output layer size
]

# Read input data
file_name = sys.argv[1]
data = dh.read_data(file_name)
random.shuffle(data)

N = len(data)
nvd = int(0.1 * N)  # number of validation images (10% of the data)
print("Number of images: " + str(N))
print("Number of validation images: " + str(nvd))
print("Number of training data: " + str(N - nvd))

training_data = data[nvd:]
validation_data = data[0:nvd]

# Setup the network
ae_one = ae.AutoEncoder(dh.MNIST_WIDTH, dh.MNIST_HEIGHT,
                        network_structure_list, standard_deviation,
                        initial_bias, loss, verbosity_level)
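For reference, a numerically stable NumPy sketch of the sigmoid cross-entropy that the "SCE" option presumably selects (the actual loss inside ae.AutoEncoder is not shown in this snippet):

import numpy as np

def sigmoid_cross_entropy(logits, targets):
    # Stable form of -[t*log(sigmoid(z)) + (1-t)*log(1-sigmoid(z))]:
    # max(z, 0) - z*t + log(1 + exp(-|z|)), averaged over all elements.
    z, t = np.asarray(logits), np.asarray(targets)
    return np.mean(np.maximum(z, 0) - z * t + np.log1p(np.exp(-np.abs(z))))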
Example #8
    plt.savefig("KAlleF1_training_E_{}_B_{}.png".format(epochs, batches))
    plt.show()

    fig, ax = plt.subplots(figsize=(10, 10))
    sbr.heatmap(test_accuracy, annot=True, ax=ax, cmap="viridis")
    ax.set_title("Test Accuracy")
    ax.set_xlabel("Neurons per layer")
    ax.set_ylabel("Hidden layers")
    plt.yticks(ticks=np.arange(len(hidden_layers)), labels=hidden_layers)
    plt.xticks(ticks=np.arange(len(neurons_pr_layer)), labels=neurons_pr_layer)
    plt.savefig("KAlleF1_test_E_{}_B_{}.png".format(epochs, batches))
    plt.show()


if __name__ == '__main__':
    df = read_data()
    features = df.loc[:, df.columns != 'defaultPaymentNextMonth'].values
    targets = df.loc[:, df.columns == 'defaultPaymentNextMonth'].values

    #cnn = CondensedNearestNeighbour(random_state=1337)
    #print(f"{features.shape} skalle {np.reshape(targets, (len(targets), )).shape}")
    #features, targets = cnn.fit_resample(features, np.reshape(targets, (len(targets),)))
    #print(f"{features.shape} skalle {targets.shape}")

    sm = SMOTE(random_state=42)
    features, targets = sm.fit_resample(features, targets)

    data_train, data_test, targets_train, targets_test = train_test_split(
        features, targets, test_size=0.2, shuffle=True)

    #sm = SMOTE(random_state=42)
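SMOTE balances the two classes by synthesizing new minority-class samples between nearest neighbors; a quick illustrative check of the effect (not part of the original script):

import numpy as np
from collections import Counter

# After sm.fit_resample, both classes should appear in equal numbers.
print(Counter(np.asarray(targets).ravel()))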
Example #9
import sys

import numpy as np

import data_handling as dh
import deep_models as dm

#random.seed(1)

if __name__ == "__main__":
    print("Starting main.")

    # ------------------------------------------------------------------------
    # Load data
    # ------------------------------------------------------------------------

    train_images, truth_images = dh.read_data(data_dir="training/images/")
    iw, ih, ic = train_images[0].shape  # image dimensions and channel count

    if len(train_images) != len(truth_images):
        sys.exit("ERROR: Dimension mismatch.")
    n_images = len(train_images)

    print("Data loaded.")
    print("Number of train images:" + str(len(train_images)))
    print("Number of truth images:" + str(len(truth_images)))
    print("Train image size: " + str(train_images[0].shape))
    print("Truth image size: " + str(truth_images[0].shape))

    # ------------------------------------------------------------------------
    # Augment data
    # ------------------------------------------------------------------------
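The snippet ends at the augmentation header. For paired image/ground-truth data like this, a typical augmentation applies identical flips and rotations to both images so they stay aligned; the sketch below is illustrative and not the project's actual code:

    # Hypothetical augmentation: rotate and mirror each training/truth pair.
    augmented_train, augmented_truth = [], []
    for img, mask in zip(train_images, truth_images):
        for k in range(4):  # rotations by 0, 90, 180 and 270 degrees
            augmented_train.append(np.rot90(img, k))
            augmented_truth.append(np.rot90(mask, k))
        augmented_train.append(np.fliplr(img))
        augmented_truth.append(np.fliplr(mask))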