def get_split_data(air_files,
                   train_ratio=.85,
                   val_ratio=.075,
                   test_ratio=.075,
                   stratified=True,
                   framesize=None,
                   print_set_report=True,
                   clustered_data=True,
                   **kwargs):
    """Load the AIR data and partition it into train, validation and test subsets.

    The data is read with ``read_air_and_filters_xy`` and split in two stages:
    first ``test_ratio`` of the samples is held out as the test set, then the
    remainder is divided into train and validation so that the validation set
    makes up ``val_ratio`` of the whole. Both stages use ``random_state=50``.

    Args:
        air_files: Forwarded to ``read_air_and_filters_xy``.
        train_ratio: Fraction of samples for training.
        val_ratio: Fraction of samples for validation.
        test_ratio: Fraction of samples for testing.
        stratified: If true, use ``StratifiedShuffleSplit`` where every
            distinct row of ``y`` is treated as one class; otherwise use
            ``ShuffleSplit``.
        framesize: Forwarded to ``read_air_and_filters_xy``.
        print_set_report: If true, call ``print_split_report`` on the
            resulting index sets.
        clustered_data: Accepted but not used in this function body.
        **kwargs: Forwarded to ``read_air_and_filters_xy``.

    Returns:
        A 9-tuple, in this order (note that test comes before validation):
        ``(x_train, y_train), ids_train, (x_test, y_test), ids_test,
        (x_val, y_val), ids_val, (x, y), ids, class_names``.

    Raises:
        AssertionError: If the three ratios do not sum to 1.
    """
    import numpy as np
    # Imported by the original code although the name is not used below.
    from myutils_reverb import read_li8_file

    ratio_total = train_ratio + test_ratio + val_ratio
    if not isclose(ratio_total, 1.):
        raise AssertionError('Ratios should sum to 1.00 and not ' +
                             str(ratio_total))

    (x, y), ids, class_names = read_air_and_filters_xy(air_files,
                                                       framesize=framesize,
                                                       **kwargs)

    if stratified:
        from sklearn.model_selection import StratifiedShuffleSplit as splitter
        # Collapse each distinct label row into one integer class id so the
        # stratified splitter can work with a 1-D target.
        unique_rows = np.unique(y, axis=0)
        split_labels = np.zeros(y.shape[0], dtype='int64')
        for class_id, row in enumerate(unique_rows):
            split_labels[np.all(y == row, axis=1)] = class_id
    else:
        from sklearn.model_selection import ShuffleSplit as splitter
        split_labels = y

    def _single_split(labels, held_out_fraction):
        # Return the (kept, held_out) index arrays of a one-shot split.
        cv = splitter(n_splits=1,
                      test_size=held_out_fraction,
                      random_state=50)
        return next(cv.split(np.zeros_like(labels), labels))

    train_val_index, test_index = _single_split(split_labels, test_ratio)
    inner_train, inner_val = _single_split(
        split_labels[train_val_index], val_ratio / (val_ratio + train_ratio))

    # Map the indices of the second split back onto the full data set.
    train_index = train_val_index[inner_train]
    val_index = train_val_index[inner_val]

    if print_set_report:
        print_split_report(y,
                           idx_list=(train_index, val_index, test_index),
                           set_names=('Train', 'Val', 'Test'))

    train_xy = (x[train_index, :], y[train_index, :])
    test_xy = (x[test_index, :], y[test_index, :])
    val_xy = (x[val_index, :], y[val_index, :])
    return (train_xy, ids[train_index],
            test_xy, ids[test_index],
            val_xy, ids[val_index],
            (x, y), ids,
            class_names)
def train_and_test(dataset, data):
    """Train a random forest on ``data`` and write profiling and evaluation reports.

    Object-typed columns of ``data`` are label-encoded in place (the caller's
    frame is modified). The ``result`` column is the target and every other
    column is a feature. Only fitting and prediction are profiled.

    Two files are written under ``result/``:
    ``<dataset>_profiling.txt`` (cProfile statistics sorted by cumulative
    time) and ``<dataset>_output.txt`` (confusion matrix followed by the
    classification report).

    Unlike the earlier version, ``sys.stdout`` is not redirected and both
    report files are closed before returning.

    Args:
        dataset: Name used as the prefix of the report file names.
        data: Table with a ``result`` column; presumably a pandas DataFrame.
    """
    for column in data.columns:
        # Compare against `object` itself; `type(object)` is `type`.
        if data[column].dtype == object:
            data[column] = LabelEncoder().fit_transform(data[column])

    y = data.result
    x = data.drop('result', axis=1)

    profile = cProfile.Profile()
    x_train, x_test, y_train, y_test = splitter(x, y, test_size=0.3)

    profile.enable()
    # train and test
    regressor = RandomForestClassifier()
    regressor.fit(x_train, y_train)
    y_pred = regressor.predict(x_test)
    profile.disable()

    os.makedirs('result', exist_ok=True)

    # Feed the Profile object straight to pstats instead of going through a
    # shared temporary 'output.prof' file in the working directory.
    with open('result/' + dataset + '_profiling.txt', 'w') as stream:
        stats = pstats.Stats(profile, stream=stream)
        stats.sort_stats('cumtime')
        stats.print_stats()

    conf_matrix = confusion_matrix(y_test, y_pred)
    with open('result/' + dataset + '_output.txt', 'w') as report:
        print(conf_matrix, file=report)
        print(classification_report(y_test, y_pred), file=report)
def train_and_test(dataset, data):
    """Train an XGBoost classifier on ``data`` and write profiling and evaluation reports.

    Object-typed columns of ``data`` are label-encoded in place (the caller's
    frame is modified). The ``result`` column is the target and every other
    column is a feature. Only fitting and prediction are profiled.

    Two files are written under ``result/``:
    ``<dataset>_profiling.txt`` (cProfile statistics sorted by cumulative
    time) and ``<dataset>_output.txt`` (confusion matrix followed by the
    classification report).

    Unlike the earlier version, ``sys.stdout`` is not redirected and both
    report files are closed before returning.

    Args:
        dataset: Name used as the prefix of the report file names.
        data: Table with a ``result`` column; presumably a pandas DataFrame.
    """
    for column in data.columns:
        # Compare against `object` itself; `type(object)` is `type`.
        if data[column].dtype == object:
            data[column] = LabelEncoder().fit_transform(data[column])

    y = data.result
    x = data.drop('result', axis=1)

    profile = cProfile.Profile()
    x_train, x_test, y_train, y_test = splitter(x, y, test_size=0.3)

    profile.enable()
    # train and test
    # eval_metric is given to the estimator rather than to fit(): passing it
    # to fit() is deprecated and rejected by newer XGBoost releases.
    model = XGBClassifier(objective='multi:softprob',
                          booster='gbtree',
                          verbosity=0,
                          eval_metric='mlogloss')
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)
    profile.disable()

    os.makedirs('result', exist_ok=True)

    # Feed the Profile object straight to pstats instead of going through a
    # shared temporary 'output.prof' file in the working directory.
    with open('result/' + dataset + '_profiling.txt', 'w') as stream:
        stats = pstats.Stats(profile, stream=stream)
        stats.sort_stats('cumtime')
        stats.print_stats()

    conf_matrix = confusion_matrix(y_test, y_pred)
    with open('result/' + dataset + '_output.txt', 'w') as report:
        print(conf_matrix, file=report)
        print(classification_report(y_test, y_pred), file=report)
# Replace the values in columns 1 and 2 of X with integer labels.
labelencoder_X_1 = LabelEncoder()
X[:, 1] = labelencoder_X_1.fit_transform(X[:, 1])
labelencoder_X_2 = LabelEncoder()
X[:, 2] = labelencoder_X_2.fit_transform(X[:, 2])
# One-hot encode column 1 and pass every other column through unchanged.
onehotencoder = ColumnTransformer(
    [("Geography", OneHotEncoder(), [1])
     ],  # The column numbers to be transformed (here is [1])
    remainder="passthrough")  # Leave the rest of the columns untouched
X = onehotencoder.fit_transform(X)
# Drop the first column of the transformed matrix (presumably one of the
# one-hot columns, to avoid the dummy-variable trap -- TODO confirm).
X = X[:, 1:]
# Split the dataset into training and test sets
from sklearn.model_selection import train_test_split as splitter
x_train, x_test, y_train, y_test = splitter(X, y, test_size=0.2, random_state=0)
# Feature scaling: the scaler is fitted on the training set only and then
# applied to the test set.
from sklearn.preprocessing import StandardScaler as scaler
scala_x = scaler()
x_train = scala_x.fit_transform(x_train)
x_test = scala_x.transform(x_test)
# ------<FIT CLASSIFICATION MODELS>----------
# NOTE(review): the string literal opened below is not closed within this
# view; its text is left untouched.
"""
#Ajustar regresion con el el conjunto de entrenamiento
#Crear modelo de clasificación aqui
df = pd.read_csv("D:/FYS-STK4155/train.csv") #df.info() X = df[[ "femaleres", "age", "married", "children", "edu", "ent_wagelabor", "durable_investment", "fs_adskipm_often" ]] y = df["depressed"] inputs = X #Feature matrix of 569 rows (samples) and 30 columns (parameters) outputs = y #Label array of 569 rows (0 for benign and 1 for malignant) x = inputs #Reassign the Feature and Label matrices to other variables y = outputs X_train, X_test, y_train, y_test = splitter( X, y, test_size=0.1) #Split datasets into training and testing y_train = to_categorical( y_train ) #Convert labels to categorical when using categorical cross entropy y_test = to_categorical(y_test) #del temp1,temp2,temp # %% # Define tunable parameters" eta = np.logspace( -3, -1, 3) #Define vector of learning rates (parameter to SGD optimiser) lamda = 0.01 #Define hyperparameter
def train_and_test(dataset, data):
    """Train a dense Keras network on ``data`` and write profiling and evaluation reports.

    Object-typed columns of ``data`` are label-encoded in place (the caller's
    frame is modified). The ``result`` column is the target and every other
    column is a feature. One-hot encoding of the labels, model construction,
    fitting and prediction are all inside the profiled region.

    Two files are written under ``result/``:
    ``<dataset>_profiling.txt`` (cProfile statistics sorted by cumulative
    time) and ``<dataset>_output.txt`` (confusion matrix followed by the
    classification report).

    Unlike the earlier version, ``sys.stdout`` is not redirected and both
    report files are closed before returning.

    Args:
        dataset: Name used as the prefix of the report file names.
        data: Table with a ``result`` column; presumably a pandas DataFrame.
    """
    for column in data.columns:
        # Compare against `object` itself; `type(object)` is `type`.
        if data[column].dtype == object:
            data[column] = LabelEncoder().fit_transform(data[column])

    y = data.result
    x = data.drop('result', axis=1)

    profile = cProfile.Profile()
    x_train, x_test, y_train, y_test = splitter(x, y, test_size=0.3)

    profile.enable()
    y_train = to_categorical(y_train)
    y_test = to_categorical(y_test)

    # NOTE(review): the validation slice is the tail of the training data and
    # is NOT removed from it, so the model is also trained on these samples.
    val_indices = 200
    x_val = x_train[-val_indices:]
    y_val = y_train[-val_indices:]

    def _hidden(units, **extra):
        # One ReLU layer with the L2 penalty shared by all hidden layers.
        return Dense(units,
                     activation='relu',
                     kernel_regularizer=regularizers.l2(0.001),
                     **extra)

    # train and test
    model = Sequential()
    model.add(_hidden(1024, input_dim=x_train.shape[1]))
    model.add(_hidden(1024))
    model.add(Dropout(0.5))
    for _ in range(2):
        model.add(_hidden(1024))
        model.add(_hidden(1024))
        model.add(Dropout(0.5))
    model.add(_hidden(512))
    model.add(_hidden(512))
    # NOTE(review): the output width is fixed at 5 classes; confirm that it
    # matches the number of label values in every data set passed in.
    model.add(Dense(5, activation='softmax'))
    model.compile(loss='categorical_crossentropy',
                  optimizer='rmsprop',
                  metrics=['accuracy'])
    model.fit(x_train,
              y_train,
              epochs=15,
              batch_size=512,
              validation_data=(x_val, y_val))
    y_pred = model.predict(x_test)
    profile.disable()

    # Convert class probabilities / one-hot rows back to class indices.
    y_pred = np.argmax(y_pred, axis=1)
    y_test = np.argmax(y_test, axis=1)

    os.makedirs('result', exist_ok=True)

    # Feed the Profile object straight to pstats instead of going through a
    # shared temporary 'output.prof' file in the working directory.
    with open('result/' + dataset + '_profiling.txt', 'w') as stream:
        stats = pstats.Stats(profile, stream=stream)
        stats.sort_stats('cumtime')
        stats.print_stats()

    conf_matrix = confusion_matrix(y_test, y_pred)
    with open('result/' + dataset + '_output.txt', 'w') as report:
        print(conf_matrix, file=report)
        print(classification_report(y_test, y_pred), file=report)
#Load dataset cancer = ds.load_breast_cancer() print('sample # = {}'.format(len(cancer.data))) print('target # = {}'.format(len(cancer.target))) print('shape = {}'.format(cancer.data.shape)) #print("Features: {}, feature dim # = {}".format(cancer.feature_names, len(cancer.feature_names))) #print('some data ex = {}, feature dim # = {}'.format(cancer.data[0:5], len(cancer.data[0]))) #print("Labels: ", cancer.target_names) # 70% training and 30% tes X_train, X_test, y_train, y_test = splitter(cancer.data, cancer.target, test_size=0.4, random_state=6265456) #print(len(X_train)) #print(len(Y_train)) #print(len(X_test)) #print(len(Y_test)) # linear kernel clf = svm.SVC(kernel='linear') # train print('training...') clf.fit(X_train, y_train) # test y_pred = clf.predict(X_test)
# Result arrays: one row per entry of `order`; column 0 holds the training
# value and column 1 the test value.
MSE = np.zeros((order.shape[0], 2))
r2 = np.zeros((order.shape[0], 2))
bias = np.zeros((order.shape[0], 2))
variance = np.zeros((order.shape[0], 2))
error = np.zeros((order.shape[0], 2))
#%%
"""Split data into training and testing data and perform regression with bootstrap"""
n_bootstrap = 500
for i in range(len(order)):
    X = DesMatrix(order[i], x, y)
    X_train, X_test, z_train, z_test = splitter(X,
                                                z,
                                                test_size=0.2,
                                                random_state=12)
    # One column of predictions per bootstrap replicate.
    z_pred_train = np.zeros((len(z_train), n_bootstrap))
    z_pred_test = np.zeros((len(z_test), n_bootstrap))
    for j in range(n_bootstrap):
        X_, z_ = resample(X_train, z_train)
        # Fit once per replicate and reuse both outputs; the previous code ran
        # the same fit twice with identical arguments.
        # NOTE(review): assumes OLS is deterministic for identical inputs.
        ols_pred = OLS(X_, X_train, X_test, z_)
        z_pred_train[:, j] = ols_pred[0].flatten()
        z_pred_test[:, j] = ols_pred[1].flatten()
    # Score the bootstrap-averaged prediction once per set and read both
    # results (index 0 is stored as MSE, index 1 as r2) from the same call.
    scores_train = metrics1(z_train,
                            np.mean(z_pred_train, axis=1, keepdims=True))
    scores_test = metrics1(z_test,
                           np.mean(z_pred_test, axis=1, keepdims=True))
    MSE[i, 0] = scores_train[0]
    MSE[i, 1] = scores_test[0]
    r2[i, 0] = scores_train[1]
    r2[i, 1] = scores_test[1]
    error[i, 0] = metrics2(z_train, z_pred_train)[0]
#%%
"""Declare arrays to store MSE and r2-score and create empty list for parameter variances (no scaling)"""
# One row per entry of `order`; column 0 is training, column 1 is testing.
MSE_unsc = np.zeros(
    (order.shape[0], 2))  # initialise arrays to store MSE of unscaled data
r2_unsc = np.zeros(
    (order.shape[0], 2))  # initialise arrays to store r2 of unscaled data
var_list_unsc = []  # initialise list to store parameter variances
#%%
"""Split data into training and testing data and perform regression (without scaling)"""
for i in range(len(order)):  # loop over model complexity
    X = DesMatrix(order[i], x, y)  # construct design matrix
    X_train, X_test, z_train, z_test = splitter(
        X, z, test_size=0.2,
        random_state=12)  # split data with test_size=0.2
    z_pred = OLS(X_train, X_test, z_train)  # perform OLS fitting
    var_list_unsc.append(var_beta(X_train))  # update parameter variance list
    # Evaluate each set once and read both results from the same call instead
    # of calling metrics() twice with identical arguments.
    scores_train = metrics(z_train, z_pred[0])
    scores_test = metrics(z_test, z_pred[1])
    MSE_unsc[i, 0] = scores_train[0]  # MSE of unscaled training data
    MSE_unsc[i, 1] = scores_test[0]  # MSE of unscaled testing data
    r2_unsc[i, 0] = scores_train[1]  # r2 of unscaled training data
    r2_unsc[i, 1] = scores_test[1]  # r2 of unscaled testing data
#%%
# Imputation step for one partially observed column `cc`.
# NOTE(review): `cc` and `ii` are not defined in this view; presumably this
# is the body of a loop over `cn_partial` -- match its indentation.
print('Imputation for column %s (%i of %i)' % (cc, ii + 1, len(cn_partial)))
tmp_y = X_df[cc].copy()
# Positions of the observed and of the missing entries of the column.
tmp_idx = np.where(tmp_y.notnull())[0]
tmp_idx_null = np.where(tmp_y.isnull())[0]
tmp_tt = tmp_y.dtype
if tmp_tt == 'object':
    # Object column: label-encode the observed values and use the
    # 'bernoulli' method with AUC as the metric function.
    li = LabelEncoder().fit(tmp_y[tmp_y.notnull()])
    tmp_y[tmp_y.notnull()] = li.transform(tmp_y[tmp_y.notnull()])
    tmp_mf = af.auc
    tmp_method = 'bernoulli'
else:
    # Any other dtype: use the 'gaussian' method with r2 as the metric
    # function.
    tmp_mf = metrics.r2_score
    tmp_method = 'gaussian'
# Split the observed positions into train and test parts (test_size=0.1)
# NOTE(review): the positional indices from np.where are used with the
# label-based `.loc` / `[]` below; this presumably relies on X_df having a
# default 0..n-1 index -- confirm.
train_idx, test_idx = splitter(tmp_idx, test_size=0.1, random_state=1234)
X_train_ii, X_test_ii = X_df.loc[train_idx, cn_complete].reset_index(drop=True), \
    X_df.loc[test_idx, cn_complete].reset_index(drop=True)
y_train_ii, y_test_ii = tmp_y[train_idx].values, tmp_y[test_idx].values
# Fit model
# NOTE(review): labels are cast with astype(int) in both branches, including
# the 'gaussian' one -- confirm this is intended for non-integer targets.
mdl_ii = mf.mbatch_NB(method=tmp_method)
mdl_ii.fit(data=X_train_ii, lbls=y_train_ii.astype(int), mbatch=100000)
score_train_ii = mdl_ii.predict(X_train_ii)
score_test_ii = mdl_ii.predict(X_test_ii)
# Scores for the rows where the column is missing.
score_impute_ii = mdl_ii.predict(X_df.loc[tmp_idx_null, cn_complete])
if tmp_tt == 'object':
    # Keep only column 1 of the predicted scores.
    score_test_ii = score_test_ii[:, 1]  # drop the first class
    score_impute_ii = score_impute_ii[:, 1]
    cal_ii = af.plot_auc(y_test_ii, score_test_ii, num=250, figure=False)
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
# Import the dataset: every column except the last is a feature, column 1 is
# the target.
df = pd.read_csv("Salary_Data.csv")
x = df.iloc[:, :-1].values
y = df.iloc[:, 1].values
# Split the dataset into training and test sets (one third held out)
from sklearn.model_selection import train_test_split as splitter
x_train, x_test, y_train, y_test = splitter(x,
                                            y,
                                            test_size=1 / 3,
                                            random_state=0)
# Force the training features into a single-column 2-D array.
x_train = x_train.reshape(-1, 1)
#x_test=x_test.reshape(-1,1)
# Feature scaling (generally not used in linear regression)
# Fit the regression model on the training set
from sklearn.linear_model import LinearRegression
regression = LinearRegression()
regression.fit(x_train, y_train)
# Predict the test set
y_pred = regression.predict(x_test)
# Print an overview of the loaded iris data set: one (title, value) pair per
# line of the report, emitted in this order.
overview = [
    ("\niris_dataset's key \n", iris_dataset.keys()),
    ("\niris_dataset's DESCR \n", iris_dataset['DESCR'][:200]),
    ("\niris_dataset's target_names \n", iris_dataset['target_names']),
    ("\niris_dataset's feature names \n", iris_dataset['feature_names']),
    ("\niris_dataset's data types \n", type(iris_dataset['data'])),
    ("\niris_dataset's data shape \n", iris_dataset['data'].shape),
    ("\niris_dataset's first five rows \n", iris_dataset['data'][:5]),
    ("\niris_dataset's target types \n", type(iris_dataset['target'])),
    ("\niris_dataset's target shape \n", iris_dataset['target'].shape),
    ("\niris_dataset's targets \n", iris_dataset['target']),
]
for title, value in overview:
    print(title, value)

from sklearn.model_selection import train_test_split as splitter

# Split features and targets with a fixed seed.
train_x, test_x, train_y, test_y = splitter(iris_dataset['data'],
                                            iris_dataset['target'],
                                            random_state=0)

# Report the shapes of the feature and target arrays of each split.
for title, features, targets in (
        ("\ntrain_x / train_y\n", train_x, train_y),
        ("\ntest_x / test_y\n", test_x, test_y),
):
    print(title, features.shape, " / ", targets.shape)

# Pairwise scatter plots of the training features, coloured by target.
iris_dataframe = pd.DataFrame(train_x, columns=iris_dataset.feature_names)
scatter_options = dict(c=train_y,
                       figsize=(15, 15),
                       marker='o',
                       hist_kwds={'bins': 20},
                       s=60,
                       alpha=.8)
pd.plotting.scatter_matrix(iris_dataframe, **scatter_options)
# NOTE(review): `str_temp`, `data` and `target` are defined outside this view.
# The next three statements look like the tail of a per-record loop whose
# header is not visible -- keep them at that loop's indentation.
# The last field of a record is its label; the other fields are scaled by
# 1/255.
target.append(str_temp[-1])
temp = [int(item) / 255 for item in str_temp[0:-1]]
data.append(temp)

print("Training data totally have " + str(len(data)) + " pieces.")

# Count how many samples carry each label.
label_dict = {}
for item in target:
    label_dict[item] = label_dict.get(item, 0) + 1
print("Training labels totally have " + str(len(label_dict)) + " classes.")

print("Training model..........................")
X_train, X_test, Y_train, Y_test = splitter(data,
                                            target,
                                            test_size=0.2,
                                            random_state=30)
model = MLPClassifier([256, 256],
                      learning_rate_init=0.001,
                      activation='relu',
                      solver='adam',
                      alpha=0.0001,
                      max_iter=10000)  # neural network
print('start training..........................')
# Fit on the training split only. Fitting on the full data, as before, let
# the model see the test samples and made the test score below meaningless.
model.fit(X_train, Y_train)
print('end training............................')
joblib.dump(model, "SimpleClassifier.pkl")
train_accuracy = model.score(X_train, Y_train)
test_accuracy = model.score(X_test, Y_test)
# Parenthesise the product: `%` and `*` share precedence, so the unbracketed
# form formatted the raw score and then repeated the message 100 times.
print("Classifier has %.4f percent accuracy on train set." %
      (train_accuracy * 100))