def get_split_data(air_files,
                   train_ratio=.85,
                   val_ratio=.075,
                   test_ratio=.075,
                   stratified=True,
                   framesize=None,
                   print_set_report=True,
                   clustered_data=True,
                   **kwargs):
    import numpy as np
    from math import isclose

    # read_air_and_filters_xy is assumed to come from the same helper module
    from myutils_reverb import read_air_and_filters_xy

    ratio_sum = train_ratio + test_ratio + val_ratio

    if not isclose(ratio_sum, 1.):
        raise ValueError('Ratios should sum to 1.00, not ' + str(ratio_sum))
    (x, y), ids, class_names = read_air_and_filters_xy(air_files,
                                                       framesize=framesize,
                                                       **kwargs)

    if stratified:
        from sklearn.model_selection import StratifiedShuffleSplit as splitter

        # Collapse the (possibly multi-column) label matrix into one integer
        # class id per row, so stratification sees discrete classes
        y_new = np.zeros((y.shape[0], )).astype('int64')
        uvals = np.unique(y, axis=0)

        for i in range(uvals.shape[0]):
            y_new[np.all(y == uvals[i, :], axis=1)] = i
    else:
        from sklearn.model_selection import ShuffleSplit as splitter
        y_new = y

    sss = splitter(n_splits=1, test_size=test_ratio, random_state=50)

    # n_splits=1, so there is exactly one (train+val, test) split to take
    train_val_index, test_index = next(
        sss.split(np.zeros_like(y_new), y_new))

    sss_val = splitter(n_splits=1,
                       test_size=val_ratio / (val_ratio + train_ratio),
                       random_state=50)
    train_index, val_index = next(
        sss_val.split(np.zeros_like(y_new[train_val_index]),
                      y_new[train_val_index]))

    # Map train/val positions back to indices into the full dataset
    train_index = train_val_index[train_index]
    val_index = train_val_index[val_index]

    if print_set_report:
        # print_split_report is assumed to be defined alongside this function
        # or imported from the same helper module
        print_split_report(y,
                           idx_list=(train_index, val_index, test_index),
                           set_names=('Train', 'Val', 'Test'))

    return (x[train_index, :], y[train_index, :]), ids[train_index], \
           (x[test_index, :], y[test_index, :]), ids[test_index], \
           (x[val_index, :], y[val_index, :]), ids[val_index], \
           (x, y), ids, \
           class_names
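
# A minimal usage sketch of get_split_data; the file list and framesize value
# are hypothetical, and the tuples come back in train, test, val order, as in
# the return statement above:
(x_tr, y_tr), ids_tr, (x_te, y_te), ids_te, (x_va, y_va), ids_va, \
    (x_all, y_all), ids_all, class_names = get_split_data(
        ['air_room1.wav', 'air_room2.wav'], framesize=1024)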
def train_and_test(dataset, data):
    import cProfile
    import os
    import pstats

    from sklearn.ensemble import RandomForestClassifier
    from sklearn.metrics import classification_report, confusion_matrix
    from sklearn.model_selection import train_test_split as splitter
    from sklearn.preprocessing import LabelEncoder

    # Label-encode every object (string) column
    for column in data.columns:
        if data[column].dtype == object:
            le = LabelEncoder()
            data[column] = le.fit_transform(data[column])
    y = data.result
    x = data.drop('result', axis=1)
    profile = cProfile.Profile()
    x_train, x_test, y_train, y_test = splitter(x, y, test_size=0.3)
    profile.enable()
    # train and test
    classifier = RandomForestClassifier()
    classifier.fit(x_train, y_train)
    y_pred = classifier.predict(x_test)
    profile.disable()
    profile.dump_stats('output.prof')
    with open('result/' + dataset + '_profiling.txt', 'w') as stream:
        stats = pstats.Stats('output.prof', stream=stream)
        stats.sort_stats('cumtime')
        stats.print_stats()
    os.remove('output.prof')
    conf_matrix = confusion_matrix(y_test, y_pred)
    with open('result/' + dataset + '_output.txt', 'w') as f:
        print(conf_matrix, file=f)
        print(classification_report(y_test, y_pred), file=f)
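
# A usage sketch for train_and_test; the CSV name is hypothetical, and a
# categorical 'result' column plus an existing result/ directory are assumed:
import pandas as pd

data = pd.read_csv('some_dataset.csv')
train_and_test('some_dataset', data)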
Example no. 3
def train_and_test(dataset, data):
    import cProfile
    import os
    import pstats

    from sklearn.metrics import classification_report, confusion_matrix
    from sklearn.model_selection import train_test_split as splitter
    from sklearn.preprocessing import LabelEncoder
    from xgboost import XGBClassifier

    # Label-encode every object (string) column
    for column in data.columns:
        if data[column].dtype == object:
            le = LabelEncoder()
            data[column] = le.fit_transform(data[column])
    y = data.result
    x = data.drop('result', axis=1)
    profile = cProfile.Profile()
    x_train, x_test, y_train, y_test = splitter(x, y, test_size=0.3)
    profile.enable()
    # train and test
    model = XGBClassifier(objective='multi:softprob', booster='gbtree',
                          verbosity=0)
    model.fit(x_train, y_train, eval_metric='mlogloss')
    y_pred = model.predict(x_test)
    profile.disable()
    profile.dump_stats('output.prof')
    with open('result/' + dataset + '_profiling.txt', 'w') as stream:
        stats = pstats.Stats('output.prof', stream=stream)
        stats.sort_stats('cumtime')
        stats.print_stats()
    os.remove('output.prof')
    conf_matrix = confusion_matrix(y_test, y_pred)
    with open('result/' + dataset + '_output.txt', 'w') as f:
        print(conf_matrix, file=f)
        print(classification_report(y_test, y_pred), file=f)
Example no. 4
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Label-encode the two categorical columns
labelencoder_X_1 = LabelEncoder()
X[:, 1] = labelencoder_X_1.fit_transform(X[:, 1])
labelencoder_X_2 = LabelEncoder()
X[:, 2] = labelencoder_X_2.fit_transform(X[:, 2])
onehotencoder = ColumnTransformer(
    [("Geography", OneHotEncoder(), [1])
     ],  # The column numbers to be transformed (here is [1])
    remainder="passthrough")  # Leave the rest of the columns untouched
X = onehotencoder.fit_transform(X)
X = X[:, 1:]  # drop one dummy column to avoid the dummy variable trap
# split the dataset into training and test sets

from sklearn.model_selection import train_test_split as splitter

x_train, x_test, y_train, y_test = splitter(X,
                                            y,
                                            test_size=0.2,
                                            random_state=0)

# Feature scaling
from sklearn.preprocessing import StandardScaler as scaler

scala_x = scaler()
x_train = scala_x.fit_transform(x_train)
x_test = scala_x.transform(x_test)

#------<FIT CLASSIFICATION MODELS>----------

# Fit the regression on the training set
# Create the classification model here

Example no. 5

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split as splitter
from tensorflow.keras.utils import to_categorical

df = pd.read_csv("D:/FYS-STK4155/train.csv")
#df.info()

X = df[[
    "femaleres", "age", "married", "children", "edu", "ent_wagelabor",
    "durable_investment", "fs_adskipm_often"
]]
y = df["depressed"]

# X is the feature matrix of selected survey columns and y the binary
# depression label (0/1)

X_train, X_test, y_train, y_test = splitter(
    X, y, test_size=0.1)  #Split datasets into training and testing

# Convert labels to one-hot when using categorical cross-entropy
y_train = to_categorical(y_train)
y_test = to_categorical(y_test)

#del temp1,temp2,temp

# %%

# Define tunable parameters

eta = np.logspace(
    -3, -1, 3)  # vector of learning rates (parameter to the SGD optimiser)
lamda = 0.01  # regularisation hyperparameter ('lambda' is reserved in Python)
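
# A minimal sketch of how this learning-rate grid might be consumed, with
# scikit-learn's SGDClassifier standing in for the original script's SGD
# optimiser (an assumption made here):
from sklearn.linear_model import SGDClassifier

for e in eta:
    clf = SGDClassifier(loss='log_loss', penalty='l2', alpha=lamda,
                        learning_rate='constant', eta0=e, max_iter=1000)
    # sklearn expects integer labels, so undo the one-hot encoding
    clf.fit(X_train, np.argmax(y_train, axis=1))
    print(e, clf.score(X_test, np.argmax(y_test, axis=1)))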
Example no. 6

def train_and_test(dataset, data):
    import cProfile
    import os
    import pstats

    import numpy as np
    from sklearn.metrics import classification_report, confusion_matrix
    from sklearn.model_selection import train_test_split as splitter
    from sklearn.preprocessing import LabelEncoder
    from tensorflow.keras import regularizers
    from tensorflow.keras.layers import Dense, Dropout
    from tensorflow.keras.models import Sequential
    from tensorflow.keras.utils import to_categorical

    # Label-encode every object (string) column
    for column in data.columns:
        if data[column].dtype == object:
            le = LabelEncoder()
            data[column] = le.fit_transform(data[column])
    y = data.result
    x = data.drop('result', axis=1)
    profile = cProfile.Profile()
    x_train, x_test, y_train, y_test = splitter(x, y, test_size=0.3)
    profile.enable()
    y_train = to_categorical(y_train)
    y_test = to_categorical(y_test)
    # Hold out the last 200 training rows for validation and drop them from
    # the training set, so the model is not validated on data it trained on
    val_indices = 200
    x_val = x_train[-val_indices:]
    y_val = y_train[-val_indices:]
    x_train = x_train[:-val_indices]
    y_train = y_train[:-val_indices]
    # train and test
    model = Sequential()
    model.add(
        Dense(1024,
              activation='relu',
              input_dim=x_train.shape[1],
              kernel_regularizer=regularizers.l2(0.001)))
    model.add(
        Dense(1024,
              activation='relu',
              kernel_regularizer=regularizers.l2(0.001)))
    model.add(Dropout(0.5))
    model.add(
        Dense(1024,
              activation='relu',
              kernel_regularizer=regularizers.l2(0.001)))
    model.add(
        Dense(1024,
              activation='relu',
              kernel_regularizer=regularizers.l2(0.001)))
    model.add(Dropout(0.5))
    model.add(
        Dense(1024,
              activation='relu',
              kernel_regularizer=regularizers.l2(0.001)))
    model.add(
        Dense(1024,
              activation='relu',
              kernel_regularizer=regularizers.l2(0.001)))
    model.add(Dropout(0.5))
    model.add(
        Dense(512,
              activation='relu',
              kernel_regularizer=regularizers.l2(0.001)))
    model.add(
        Dense(512,
              activation='relu',
              kernel_regularizer=regularizers.l2(0.001)))
    model.add(Dense(5, activation='softmax'))  # assumes five target classes
    model.compile(loss='categorical_crossentropy',
                  optimizer='rmsprop',
                  metrics=['accuracy'])
    model.fit(x_train,
              y_train,
              epochs=15,
              batch_size=512,
              validation_data=(x_val, y_val))
    y_pred = model.predict(x_test)
    profile.disable()
    # Convert one-hot predictions and labels back to class indices
    y_pred = np.argmax(y_pred, axis=1)
    y_test = np.argmax(y_test, axis=1)
    profile.dump_stats('output.prof')
    with open('result/' + dataset + '_profiling.txt', 'w') as stream:
        stats = pstats.Stats('output.prof', stream=stream)
        stats.sort_stats('cumtime')
        stats.print_stats()
    os.remove('output.prof')
    conf_matrix = confusion_matrix(y_test, y_pred)
    with open('result/' + dataset + '_output.txt', 'w') as f:
        print(conf_matrix, file=f)
        print(classification_report(y_test, y_pred), file=f)
Example no. 7
# Load dataset
from sklearn import datasets as ds
from sklearn import svm
from sklearn.model_selection import train_test_split as splitter

cancer = ds.load_breast_cancer()

print('sample # = {}'.format(len(cancer.data)))
print('target # = {}'.format(len(cancer.target)))
print('shape = {}'.format(cancer.data.shape))

#print("Features: {}, feature dim # = {}".format(cancer.feature_names, len(cancer.feature_names)))
#print('some data ex = {}, feature dim # = {}'.format(cancer.data[0:5], len(cancer.data[0])))

#print("Labels: ", cancer.target_names)

# 60% training and 40% test (test_size=0.4)
X_train, X_test, y_train, y_test = splitter(cancer.data,
                                            cancer.target,
                                            test_size=0.4,
                                            random_state=6265456)
#print(len(X_train))
#print(len(Y_train))
#print(len(X_test))
#print(len(Y_test))

# linear kernel
clf = svm.SVC(kernel='linear')

# train
print('training...')
clf.fit(X_train, y_train)

# test
y_pred = clf.predict(X_test)
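
# A short evaluation sketch to close the example; the metrics import is an
# addition here:
from sklearn import metrics

print('accuracy  = {:.4f}'.format(metrics.accuracy_score(y_test, y_pred)))
print('precision = {:.4f}'.format(metrics.precision_score(y_test, y_pred)))
print('recall    = {:.4f}'.format(metrics.recall_score(y_test, y_pred)))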
# One row per polynomial order; columns hold (train, test) values
MSE = np.zeros((order.shape[0], 2))
r2 = np.zeros((order.shape[0], 2))
bias = np.zeros((order.shape[0], 2))
variance = np.zeros((order.shape[0], 2))
error = np.zeros((order.shape[0], 2))

#%%
"""Split data into training and testing data and perform regression with bootstrap"""

n_bootstrap = 500

for i in range(len(order)):
    X = DesMatrix(order[i], x, y)
    X_train, X_test, z_train, z_test = splitter(X,
                                                z,
                                                test_size=0.2,
                                                random_state=12)
    z_pred_train = np.zeros((len(z_train), n_bootstrap))
    z_pred_test = np.zeros((len(z_test), n_bootstrap))
    for j in range(n_bootstrap):
        # Draw a bootstrap sample and fit OLS once per draw
        X_, z_ = resample(X_train, z_train)
        z_pred = OLS(X_, X_train, X_test, z_)
        z_pred_train[:, j] = z_pred[0].flatten()
        z_pred_test[:, j] = z_pred[1].flatten()
    # Average the bootstrap predictions before scoring
    z_mean_train = np.mean(z_pred_train, axis=1, keepdims=True)
    z_mean_test = np.mean(z_pred_test, axis=1, keepdims=True)
    MSE[i, 0], r2[i, 0] = metrics1(z_train, z_mean_train)[:2]
    MSE[i, 1], r2[i, 1] = metrics1(z_test, z_mean_test)[:2]
    error[i, 0] = metrics2(z_train, z_pred_train)[0]
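
    # The bias and variance arrays above are not filled in this excerpt; a
    # sketch of the usual bootstrap decomposition on the test set (assuming
    # z_test is a column vector, as the metrics calls suggest) might be:
    bias[i, 1] = np.mean(  # squared bias of the mean bootstrap prediction
        (z_test - np.mean(z_pred_test, axis=1, keepdims=True))**2)
    variance[i, 1] = np.mean(np.var(z_pred_test, axis=1, keepdims=True))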
Example no. 9
#%%
"""Declare arrays to store MSE and r2-score and create empty list for parameter variances (no scaling)"""

MSE_unsc = np.zeros((order.shape[0], 2))  # MSE of unscaled data (train, test)
r2_unsc = np.zeros((order.shape[0], 2))  # r2 of unscaled data (train, test)
var_list_unsc = []  # list to store parameter variances

#%%
"""Split data into training and testing data and perform regression (without scaling)"""

for i in range(len(order)):  # loop over model complexity
    X = DesMatrix(order[i], x, y)  # construct design matrix
    X_train, X_test, z_train, z_test = splitter(
        X, z, test_size=0.2,
        random_state=12)  # split data into 80% train and 20% test
    z_pred = OLS(X_train, X_test, z_train)  # perform OLS fitting
    var_list_unsc.append(var_beta(X_train))  # update parameter variance list
    MSE_unsc[i, 0] = metrics(z_train, z_pred[0])[0]  # train MSE (unscaled)
    MSE_unsc[i, 1] = metrics(z_test, z_pred[1])[0]  # test MSE (unscaled)
    r2_unsc[i, 0] = metrics(z_train, z_pred[0])[1]  # train r2 (unscaled)
    r2_unsc[i, 1] = metrics(z_test, z_pred[1])[1]  # test r2 (unscaled)

#%%
Example no. 10
    # Inside a loop over the partially observed columns cn_partial, where cc
    # is the current column name and ii its position
    print('Imputation for column %s (%i of %i)' %
          (cc, ii + 1, len(cn_partial)))
    tmp_y = X_df[cc].copy()
    tmp_idx = np.where(tmp_y.notnull())[0]
    tmp_idx_null = np.where(tmp_y.isnull())[0]
    tmp_tt = tmp_y.dtype
    if tmp_tt == 'object':
        li = LabelEncoder().fit(tmp_y[tmp_y.notnull()])
        tmp_y[tmp_y.notnull()] = li.transform(tmp_y[tmp_y.notnull()])
        tmp_mf = af.auc
        tmp_method = 'bernoulli'
    else:
        tmp_mf = metrics.r2_score
        tmp_method = 'gaussian'
    # Split data
    train_idx, test_idx = splitter(tmp_idx, test_size=0.1, random_state=1234)
    X_train_ii, X_test_ii = X_df.loc[train_idx, cn_complete].reset_index(drop=True), \
                            X_df.loc[test_idx, cn_complete].reset_index(drop=True)
    y_train_ii, y_test_ii = tmp_y[train_idx].values, tmp_y[test_idx].values

    # Fit model
    mdl_ii = mf.mbatch_NB(method=tmp_method)
    mdl_ii.fit(data=X_train_ii, lbls=y_train_ii.astype(int), mbatch=100000)
    score_train_ii = mdl_ii.predict(X_train_ii)
    score_test_ii = mdl_ii.predict(X_test_ii)
    score_impute_ii = mdl_ii.predict(X_df.loc[tmp_idx_null, cn_complete])

    if tmp_tt == 'object':
        score_test_ii = score_test_ii[:, 1]  # keep the positive-class score
        score_impute_ii = score_impute_ii[:, 1]
        cal_ii = af.plot_auc(y_test_ii, score_test_ii, num=250, figure=False)
Example no. 11
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# import the dataset
df = pd.read_csv("Salary_Data.csv")

x = df.iloc[:, :-1].values
y = df.iloc[:, 1].values

# split the dataset into training and test sets
from sklearn.model_selection import train_test_split as splitter

x_train, x_test, y_train, y_test = splitter(x,
                                            y,
                                            test_size=1 / 3,
                                            random_state=0)
x_train = x_train.reshape(-1, 1)  # no-op here: x is already shaped (n, 1)
#x_test=x_test.reshape(-1,1)

# Feature scaling (generally not used for simple linear regression)

# Fit the linear regression model on the training set
from sklearn.linear_model import LinearRegression

regression = LinearRegression()
regression.fit(x_train, y_train)

# predict on the test set

y_pred = regression.predict(x_test)
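
# A closing visualisation sketch, the step such tutorials usually end with
# (the axis labels assume the Salary_Data.csv columns):
plt.scatter(x_train, y_train, color='red')
plt.plot(x_train, regression.predict(x_train), color='blue')
plt.title('Salary vs. years of experience (training set)')
plt.xlabel('Years of experience')
plt.ylabel('Salary')
plt.show()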
Example no. 12
# Load the iris dataset as a scikit-learn Bunch
import pandas as pd

from sklearn.datasets import load_iris

iris_dataset = load_iris()

print("\niris_dataset's key \n", iris_dataset.keys())
print("\niris_dataset's DESCR \n", iris_dataset['DESCR'][:200])
print("\niris_dataset's target_names \n", iris_dataset['target_names'])
print("\niris_dataset's feature names \n", iris_dataset['feature_names'])

print("\niris_dataset's data types \n", type(iris_dataset['data']))
print("\niris_dataset's data shape \n", iris_dataset['data'].shape)
print("\niris_dataset's first five rows \n", iris_dataset['data'][:5])

print("\niris_dataset's target types \n", type(iris_dataset['target']))
print("\niris_dataset's target shape \n", iris_dataset['target'].shape)
print("\niris_dataset's targets \n", iris_dataset['target'])

from sklearn.model_selection import train_test_split as splitter
train_x, test_x, train_y, test_y = splitter(iris_dataset['data'],
                                            iris_dataset['target'],
                                            random_state=0)

print("\ntrain_x / train_y\n", train_x.shape, " / ", train_y.shape)
print("\ntest_x / test_y\n", test_x.shape, " / ", test_y.shape)

# scatter matrix of iris_dataset by pandas
iris_dataframe = pd.DataFrame(train_x, columns=iris_dataset.feature_names)
pd.plotting.scatter_matrix(iris_dataframe,
                           c=train_y,
                           figsize=(15, 15),
                           marker='o',
                           hist_kwds={'bins': 20},
                           s=60,
                           alpha=.8)
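
# When run as a plain script (outside a notebook), the scatter matrix still
# needs an explicit draw call:
import matplotlib.pyplot as plt

plt.show()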
Example no. 13
    # Inside a loop over the raw input lines: the last field is the label and
    # the preceding fields are 0-255 values scaled to [0, 1]
    target.append(str_temp[-1])
    temp = []
    for item in str_temp[0:-1]:
        temp.append(int(item) / 255)
    data.append(temp)
print("Training data totally have " + str(len(data)) + " pieces.")
label_dict = {}
for item in target:
    if item not in label_dict:
        label_dict[item] = 0
    label_dict[item] += 1
print("Training labels totally have " + str(len(label_dict.keys())) +
      " classes.")
print("Training model..........................")
X_train, X_test, Y_train, Y_test = splitter(data,
                                            target,
                                            test_size=0.2,
                                            random_state=30)
from sklearn.neural_network import MLPClassifier

model = MLPClassifier([256, 256],
                      learning_rate_init=0.001,
                      activation='relu',
                      solver='adam',
                      alpha=0.0001,
                      max_iter=10000)  # MLP with two 256-unit hidden layers
print('start training..........................')
model.fit(X_train, Y_train)  # fit on the training split only
print('end training............................')
joblib.dump(model, "SimpleClassifier.pkl")
train_accuracy = model.score(X_train, Y_train)
test_accuracy = model.score(X_test, Y_test)
print("Classifier has %.4f percent accuracy on the train set." %
      (train_accuracy * 100))
print("Classifier has %.4f percent accuracy on the test set." %
      (test_accuracy * 100))