def main(args, config):
    """Train the Keras classifier for one fold and write its artifacts.

    One ROOT tree per class is read from the fold's dataset file and turned
    into input, one-hot target and weight arrays. The configured input
    preprocessing is fitted and pickled, the model named in the configuration
    is trained, and the model plus its training history are stored in
    ``config["output_path"]``.

    Parameters
    ----------
    args : object
        Only ``args.fold`` is read; it indexes ``config["datasets"]`` and is
        embedded in the output file names.
    config : mapping
        Keys read here: ``seed``, ``variables``, ``datasets``, ``classes``,
        ``event_weights``, ``class_weights``, ``global_weight_scale``,
        ``preprocessing``, ``output_path``, ``train_test_split`` and
        ``model`` (``name``, ``batch_size``, ``epochs`` and, optionally,
        ``early_stopping``, ``save_best_only``, ``reduce_lr_on_plateau``).

    Raises
    ------
    Exception
        If a class tree is missing from the input file, the preprocessing
        method is unknown, or the model name is not provided by
        ``keras_models``.
    """
    # Set seed and import packages
    # NOTE: This needs to be done before any keras module is imported!
    seed = int(config["seed"])
    logger.debug("Import packages and set random seed to %s.", seed)
    import numpy as np
    np.random.seed(seed)
    import ROOT
    ROOT.PyConfig.IgnoreCommandLineOptions = True  # disable ROOT internal argument parser
    import root_numpy
    from sklearn import preprocessing, model_selection
    import keras_models
    from keras.callbacks import ReduceLROnPlateau, EarlyStopping, ModelCheckpoint

    # Extract list of variables
    variables = config["variables"]
    logger.debug("Use variables:")
    for v in variables:
        logger.debug("%s", v)

    # Load training dataset
    filename = config["datasets"][args.fold]
    logger.debug("Load training dataset from %s.", filename)
    x = []
    y = []
    w = []
    classes = config["classes"]
    weight_branch = config["event_weights"]
    rfile = ROOT.TFile(filename, "READ")
    try:
        for i_class, class_ in enumerate(classes):
            logger.debug("Process class %s.", class_)
            tree = rfile.Get(class_)
            # PyROOT null pointers compare equal to None but are not the
            # None singleton, so "== None" is deliberate here.
            if tree == None:  # noqa: E711
                logger.fatal("Tree %s not found in file %s.", class_, filename)
                raise Exception("Tree {} not found in file {}.".format(
                    class_, filename))
            n_entries = tree.GetEntries()

            # Get inputs for this class
            x_class = np.zeros((n_entries, len(variables)))
            x_conv = root_numpy.tree2array(tree, branches=variables)
            for i_var, var in enumerate(variables):
                x_class[:, i_var] = x_conv[var]
            x.append(x_class)

            # Get weights
            w_class = np.zeros((n_entries, 1))
            w_conv = root_numpy.tree2array(tree, branches=[weight_branch])
            w_class[:, 0] = w_conv[weight_branch] * config["class_weights"][class_]
            w.append(w_class)

            # Get targets for this class (one-hot column of this class)
            y_class = np.zeros((n_entries, len(classes)))
            y_class[:, i_class] = 1.0
            y.append(y_class)
    finally:
        # All data has been copied into numpy arrays at this point.
        rfile.Close()

    # Stack inputs, targets and weights to a Keras-readable dataset
    x = np.vstack(x)  # inputs
    y = np.vstack(y)  # targets
    w = np.vstack(w) * config["global_weight_scale"]  # weights
    w = np.squeeze(w)  # needed to get weights into keras

    # Perform input variable transformation and pickle scaler object
    method = config["preprocessing"]
    logger.info("Use preprocessing method %s.", method)
    mean_std_fmt = "Preprocessing (variable, mean, std): %s, %s, %s"
    min_max_fmt = "Preprocessing (variable, min, max): %s, %s, %s"
    if "standard_scaler" in method:
        scaler = preprocessing.StandardScaler().fit(x)
        summary = (mean_std_fmt, scaler.mean_, scaler.scale_)
    elif "identity" in method:
        # A fitted StandardScaler whose parameters are overwritten so that
        # transform() leaves the inputs unchanged.
        scaler = preprocessing.StandardScaler().fit(x)
        scaler.mean_[:] = 0.0
        scaler.scale_[:] = 1.0
        summary = (mean_std_fmt, scaler.mean_, scaler.scale_)
    elif "robust_scaler" in method:
        scaler = preprocessing.RobustScaler().fit(x)
        summary = (mean_std_fmt, scaler.center_, scaler.scale_)
    elif "min_max_scaler" in method:
        scaler = preprocessing.MinMaxScaler(feature_range=(-1.0, 1.0)).fit(x)
        summary = (min_max_fmt, scaler.data_min_, scaler.data_max_)
    elif "quantile_transformer" in method:
        scaler = preprocessing.QuantileTransformer(
            output_distribution="normal", random_state=seed).fit(x)
        summary = None
    else:
        logger.fatal("Preprocessing %s is not implemented.", method)
        raise Exception(
            "Preprocessing {} is not implemented.".format(method))
    if summary is not None:
        fmt, first, second = summary
        for var, a, b in zip(variables, first, second):
            logger.debug(fmt, var, a, b)
    x = scaler.transform(x)

    path_preprocessing = os.path.join(
        config["output_path"],
        "fold{}_keras_preprocessing.pickle".format(args.fold))
    logger.info("Write preprocessing object to %s.", path_preprocessing)
    with open(path_preprocessing, 'wb') as scaler_file:
        pickle.dump(scaler, scaler_file)

    # Split data in training and testing
    x_train, x_test, y_train, y_test, w_train, w_test = \
        model_selection.train_test_split(
            x, y, w,
            test_size=1.0 - config["train_test_split"],
            random_state=seed)

    # Add callbacks
    model_config = config["model"]
    callbacks = []
    if "early_stopping" in model_config:
        logger.info("Stop early after %s tries.",
                    model_config["early_stopping"])
        callbacks.append(
            EarlyStopping(patience=model_config["early_stopping"]))

    path_model = os.path.join(config["output_path"],
                              "fold{}_keras_model.h5".format(args.fold))
    # True only when a checkpoint callback takes care of writing the model.
    checkpointing = bool("save_best_only" in model_config
                         and model_config["save_best_only"])
    if checkpointing:
        logger.info("Write best model to %s.", path_model)
        callbacks.append(
            ModelCheckpoint(path_model, save_best_only=True, verbose=1))

    if "reduce_lr_on_plateau" in model_config:
        logger.info("Reduce learning-rate after %s tries.",
                    model_config["reduce_lr_on_plateau"])
        callbacks.append(
            ReduceLROnPlateau(patience=model_config["reduce_lr_on_plateau"],
                              verbose=1))

    # Train model
    if not hasattr(keras_models, model_config["name"]):
        logger.fatal("Model %s is not implemented.", model_config["name"])
        raise Exception(
            "Model {} is not implemented.".format(model_config["name"]))
    logger.info("Train keras model %s.", model_config["name"])

    # A negative batch size requests full-batch training.
    if model_config["batch_size"] < 0:
        batch_size = x_train.shape[0]
    else:
        batch_size = model_config["batch_size"]

    model_impl = getattr(keras_models, model_config["name"])
    model = model_impl(len(variables), len(classes))
    model.summary()
    # NOTE(review): "nb_epoch" is the Keras 1 spelling; newer Keras releases
    # expect "epochs". Left unchanged because the installed version is not
    # visible from here.
    fitResults = model.fit(x_train,
                           y_train,
                           sample_weight=w_train,
                           validation_data=(x_test, y_test, w_test),
                           batch_size=batch_size,
                           nb_epoch=model_config["epochs"],
                           shuffle=True,
                           callbacks=callbacks)

    # Save model: needed whenever no checkpoint callback wrote it, which
    # includes the case "save_best_only" present but set to a false value.
    if not checkpointing:
        logger.info("Write model to %s.", path_model)
        model.save(path_model)

    # path_model ends in ".h5"; strip it to derive the history file name.
    with open(path_model[:-3] + "_history.pkl", "wb") as history_file:
        pickle.dump(fitResults.history, history_file)
def validationLargeFile(filename, testFilename, cv=5):
    """
    This function evaluates the performance of all possible algorithms to
    predict target variable in the next year using train-validate-test
    validation and mean absolute error.
    This function is used in iteration 2.

    For every target, ranking method, algorithm and feature count (1..50) an
    estimator is fitted on the first 75% of the training rows and scored on
    the remaining 25%. The test file is only used for the printed target
    statistics.

    ** This function is hard coded. Please be careful while editing. **

    Parameters
    ----------
    filename : string
        The name of stacked file used as training data in iteration 2.
    testFilename : string
        The name of file used as test data in iteration 2.
    cv : int
        The number of fold for cross validation. Not used in this function.

    Return Value
    ----------
    None, but the results are saved as files instead. If getEstimator does
    not know an algorithm code, the string '<algorithm>: Wrong Algorithm' is
    returned immediately.
    """
    # filename = '2006-2013_FilteredColsTargetMissingBlank.csv'
    header = getHeader(filename)
    startTargetIndex, startPredictorIndex, numCols = getTargetAndPredictorIndex(header)
    numTargets = startPredictorIndex - startTargetIndex
    predictorHeader = header[startPredictorIndex:]
    targetHeader = header[startTargetIndex:startPredictorIndex]
    targetIDList = [int(head[0:4]) for head in targetHeader]
    collectBest = [['targetIdx', 'rankingMethod', 'algorithm', 'numFeatures',
                    'score', 'sd', 'timeProcessed']]

    usedCols = tuple(range(startTargetIndex, numCols))
    dataset = np.genfromtxt(filename, delimiter=",", skip_header=1,
                            autostrip=True, missing_values=np.nan,
                            usecols=usedCols)
    testset = np.genfromtxt(testFilename, delimiter=",", skip_header=1,
                            autostrip=True, missing_values=np.nan,
                            usecols=usedCols)

    algos = ['SVL', 'RBF', 'LAS', 'RID', 'ELA', 'MLP', 'ML1', 'ML2', 'ML3',
             'ML4', 'ML5', 'ML6', 'ML7', 'ML8', 'ML9']

    for targetIdx in range(0, numTargets):
        # Training Data ---------------------------------------------------
        X = dataset[:, tuple(range(numTargets, dataset.shape[1]))]
        y = dataset[:, targetIdx]
        keepRows = np.invert(np.isnan(y))  # drop rows without a target value
        X = X[keepRows, :]
        y = y[keepRows].reshape(-1, 1)
        Xscaler = preprocessing.StandardScaler().fit(X)
        # transform() returns a new array; the result has to be kept,
        # otherwise the features stay unscaled.
        X = Xscaler.transform(X)

        # Test Data -------------------------------------------------------
        XTest = testset[:, tuple(range(numTargets, testset.shape[1]))]
        yTest = testset[:, targetIdx]
        keepRowsTest = np.invert(np.isnan(yTest))
        XTest = XTest[keepRowsTest, :]
        yTest = yTest[keepRowsTest].reshape(-1, 1)
        # Scaled with the training statistics. The scoring below does not use
        # XTest; it is kept ready for a held-out evaluation.
        XTest = Xscaler.transform(XTest)

        # Statistics over train+test targets are only printed; the scaler
        # handed to the estimators is fitted on the training targets alone.
        yAll = np.concatenate((y, yTest))
        Yscaler = preprocessing.StandardScaler().fit(yAll)
        print('Target %d => Mean %.5f , STD %.5f, Min %.5f, Max %.5f' % (
            targetIdx, Yscaler.mean_, Yscaler.scale_, yAll.min(), yAll.max()))
        Yscaler = preprocessing.StandardScaler().fit(y)

        # The estimators expect a 1-D target vector.
        y = np.ravel(y)

        rows = []
        for rankingMethod in [0, 1, 2, 3]:
            numFeaturesTest = list(range(1, 51))
            best = [targetIdx, rankingMethod, None, None, None, None, None]
            # The first row of each table is the feature-count axis; one more
            # row is appended per algorithm.
            scoreTable = [numFeaturesTest]
            timeTable = [numFeaturesTest]
            sdTable = [numFeaturesTest]
            for algorithm in algos:
                scoreList = []
                timeList = []
                sdList = []
                for numFeatures in numFeaturesTest:
                    estimator = getEstimator(algorithm, Yscaler, numFeatures)
                    if estimator is None:
                        return algorithm + ': Wrong Algorithm'
                    XIndex = getFeaturesIndex(predictorHeader,
                                              'Feature Selection 2006-2013',
                                              targetIDList[targetIdx],
                                              rankingMethod, numFeatures)
                    Xready = X[:, tuple(XIndex)]

                    startTime = time.time()
                    # Fit on the first 75% of the rows, validate on the rest.
                    firstTestIndex = int(0.75 * Xready.shape[0])
                    estimator.fit(Xready[0:firstTestIndex, :],
                                  y[0:firstTestIndex])
                    predicted = estimator.predict(Xready[firstTestIndex:, :])
                    absolute_error = np.absolute(y[firstTestIndex:] - predicted)
                    score = absolute_error.mean()
                    sd = absolute_error.std()
                    timeProcessed = time.time() - startTime

                    scoreList.append(score)
                    sdList.append(sd)
                    timeList.append(timeProcessed)
                    # Keep the configuration with the lowest mean error.
                    if best[2] is None or score < best[4]:
                        best = [targetIdx, rankingMethod, algorithm,
                                numFeatures, score, sd, timeProcessed]
                scoreTable.append(scoreList)
                timeTable.append(timeList)
                sdTable.append(sdList)

            print('target %d, rankingMethod %d' % (targetIdx, rankingMethod))
            print(best)
            collectBest.append(best)

            # Transpose so that each CSV row is one feature count and each
            # column one algorithm.
            scoreTable = np.transpose(np.array(scoreTable)).tolist()
            timeTable = np.transpose(np.array(timeTable)).tolist()
            sdTable = np.transpose(np.array(sdTable)).tolist()
            rows.append(['score of ranking method = ' + str(rankingMethod)] + algos)
            rows.extend(scoreTable)
            rows.append(['sd of ranking method = ' + str(rankingMethod)] + algos)
            rows.extend(sdTable)
            rows.append(['time of ranking method = ' + str(rankingMethod)] + algos)
            rows.extend(timeTable)
            rows.append((1 + len(algos)) * [''])

        writeCSV('./Validation 2006-2012 Three Set/Indicator'
                 + str(targetIDList[targetIdx]) + '-'
                 + time.strftime("%Y-%m-%d-%H-%M-%S")
                 + '-'.join(algos) + '.csv', rows)

    writeCSV('./Validation 2006-2012 Three Set/CollectBest' + '-'
             + time.strftime("%Y-%m-%d-%H-%M-%S") + '.csv', collectBest)
def testModel(testFilename, filename, targetIdx, rankingMethod, algorithm, numFeatures, gridSearch = False,cv=5):
    """
    This function trains and tests the input model configurations with the
    test dataset.

    Parameters
    ----------
    testFilename : string
        The name of file in iteration 2 used as testing data.
    filename : string
        The name of stacked file in iteration 2 used as training data.
    targetIdx : int
        The index of target variable in the target list.
    rankingMethod : int
        The number specified how predictors are ranked. (See the file of
        ranked predictor lists)
    algorithm : string (usually 3-character)
        The code of prediction algorithm such as 'RID' for ridge regression.
    numFeatures : int
        The number of features included in the model.
    gridSearch : boolean
        Perform parameter optimization or not.
    cv : int
        The number of fold for cross validation. Not used in this function.

    Return Value
    ----------
    None, but the results (error, sd) are printed. On failure a message
    string is returned instead: '<algorithm>: Wrong Algorithm' when no
    estimator is available, or 'Column Error' when the test file columns do
    not line up with the training file.
    """
    # filename = '2006-2013_FilteredColsTargetMissingBlank.csv'
    header = getHeader(filename)
    startTargetIndex, startPredictorIndex, numCols = getTargetAndPredictorIndex(header)
    numTargets = startPredictorIndex - startTargetIndex
    predictorHeader = header[startPredictorIndex:]
    targetHeader = header[startTargetIndex:startPredictorIndex]
    targetIDList = [int(head[0:4]) for head in targetHeader]

    dataset = np.genfromtxt(filename, delimiter=",", skip_header=1,
                            autostrip=True, missing_values=np.nan,
                            usecols=tuple(range(startTargetIndex, numCols)))

    # ---------------------------------- Train ----------------------------
    X = dataset[:, tuple(range(numTargets, dataset.shape[1]))]
    y = dataset[:, targetIdx]
    keepRows = np.invert(np.isnan(y))  # drop rows without a target value
    X = X[keepRows, :]
    y = y[keepRows].reshape(-1, 1)
    print(X.shape)
    Xscaler = preprocessing.StandardScaler().fit(X)
    # transform() returns a new array; the result has to be kept, otherwise
    # the features stay unscaled.
    X = Xscaler.transform(X)
    Yscaler = preprocessing.StandardScaler().fit(y)

    if gridSearch:
        estimator = getEstimatorGridSearch(algorithm, Yscaler, numFeatures)
    else:
        estimator = getEstimator(algorithm, Yscaler, numFeatures)
    if estimator is None:
        return algorithm + ': Wrong Algorithm'

    XIndex = getFeaturesIndex(predictorHeader, 'Feature Selection 2006-2013',
                              targetIDList[targetIdx], rankingMethod,
                              numFeatures)
    Xready = X[:, tuple(XIndex)]
    y = np.ravel(y)  # the estimators expect a 1-D target vector
    estimator.fit(Xready, y)

    print('Best params for %s = rankingMethod %d, algo %s, numFeatures %d' % (
        targetHeader[targetIdx][0:4], rankingMethod, algorithm, numFeatures))
    if hasattr(estimator, 'best_params_'):
        print(estimator.best_params_)
    else:
        pprint(vars(estimator))

    # ---------------------------------- Test -----------------------------
    testHeader = getHeader(testFilename)
    filteredCols = getSimilarColIndex(header, testHeader)
    if numCols != len(filteredCols) + 1:
        print('Column Error')
        return 'Column Error'
    testset = np.genfromtxt(testFilename, delimiter=",", skip_header=1,
                            autostrip=True, missing_values=np.nan,
                            usecols=tuple(filteredCols))

    # Fancy indexing yields a copy, so the in-place fill below does not
    # touch testset.
    XTest = testset[:, tuple(range(numTargets, testset.shape[1]))]
    yTest = testset[:, targetIdx]

    # Number of non-missing values per predictor column.
    haveNotNull = np.sum(np.invert(np.isnan(XTest)), axis=0)
    missing = []
    lowDense = []
    for col, filled in enumerate(haveNotNull):
        if filled == 0:
            # Column entirely missing in the test file: fall back to the
            # training mean (raw scale; standardized further down).
            XTest[:, col] = Xscaler.mean_[col]
            missing.append(col)
        if filled < 0.4 * XTest.shape[0]:
            lowDense.append(col)
    print('Low dense = %d' % (len(lowDense)))

    imp = Imputer(missing_values='NaN', strategy='median', axis=0)
    XTest = imp.fit_transform(XTest)

    keepRows = np.invert(np.isnan(yTest))
    XTest = XTest[keepRows, :]
    yTest = yTest[keepRows].reshape(-1, 1)
    XTest = Xscaler.transform(XTest)
    XTestReady = XTest[:, tuple(XIndex)]

    predicted = bootstrappingPrediction(estimator, Xready, y, XTestReady,
                                        B = 200, parametric = False)
    # NOTE(review): yTest has shape (n, 1). If bootstrappingPrediction
    # returns a 1-D array of length n, this subtraction broadcasts to (n, n)
    # and the scores are averaged over all pairs -- confirm the helper's
    # output shape.
    absolute_error = np.absolute(yTest - predicted)
    score_mean = absolute_error.mean()
    score_sd = absolute_error.std()

    # How many of the selected features were missing / sparse in the test set.
    missingCount = len(set(missing) & set(XIndex))
    lowDenseCount = len(set(lowDense) & set(XIndex))
    print('%s = rankingMethod %d, algo %s, numFeatures %d, score(mean, sd) = (%f,%f), missing = %d, lowDense = %d' % (
        targetHeader[targetIdx][0:4], rankingMethod, algorithm, numFeatures,
        score_mean, score_sd, missingCount, lowDenseCount))
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn import preprocessing
import matplotlib.pyplot as plt

# Compare three Naive Bayes variants with 10-fold cross-validation on the
# training split delivered by the project's data importer.
data = dI.dataImporter(shuffle=True, stratify=True)
X_train, y_train = data.getTrainData()
#X_train = X_train[:,:-3]

# define 10-fold cross validation test harness
#kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=3)
#multiNB = make_pipeline(MultinomialNB())

#multinomial needs correct probability!!!
# The multinomial model is therefore fed unscaled features, with the last
# three columns removed.
clf = make_pipeline(MultinomialNB())
clf2 = make_pipeline(preprocessing.StandardScaler(), BernoulliNB())
clf3 = make_pipeline(preprocessing.StandardScaler(), GaussianNB())

scores = cross_val_score(clf, X_train[:, :-3], y_train, cv=10)
scores2 = cross_val_score(clf2, X_train, y_train, cv=10)
# Score the Gaussian pipeline (clf3), not the Bernoulli one a second time.
scores3 = cross_val_score(clf3, X_train, y_train, cv=10)

# NOTE(review): no "scoring" argument is passed, so cross_val_score uses the
# estimator's default scorer, while the labels below say ROC_AUC. Confirm
# which metric is intended.
print("Mean ROC_AUC Multi: %.2f%% (+/- %.2f%%)" % (scores.mean() * 100, scores.std() * 100))
print("Mean ROC_AUC Bernu: %.2f%% (+/- %.2f%%)" % (scores2.mean() * 100, scores2.std() * 100))
print("Mean ROC_AUC Gauss: %.2f%% (+/- %.2f%%)" % (scores3.mean() * 100, scores3.std() * 100))

# from sklearn.metrics import roc_curve
# X_test, y_test = data.getTestData()
# probas_ = clf.fit(X_train, y_train).predict_proba(X_test)
#[ 5.1 3.5 1.4 0.2] 0
# train_test_split lives in sklearn.model_selection in current scikit-learn;
# the old sklearn.cross_validation module is only kept as a fallback for
# installations that predate model_selection.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split
from sklearn import preprocessing

# Get dataset with only the first two attributes
X, y = X_iris[:, :2], y_iris

# Split the dataset into a training and a testing set
# Test set will be the 25% taken randomly
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=33)
print(X_train.shape, y_train.shape)
#(112, 2) (112,)

# Standardize the features; the scaler is fitted on the training set only
# and then applied to both splits.
scaler = preprocessing.StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

import matplotlib.pyplot as plt
colors = ['red', 'greenyellow', 'blue']
# One scatter series per class label (0, 1, 2), drawn in that order so the
# legend entries line up with iris.target_names.
for i, color in enumerate(colors):
    in_class = y_train == i
    xs = X_train[:, 0][in_class]
    ys = X_train[:, 1][in_class]
    plt.scatter(xs, ys, c=color)
plt.legend(iris.target_names)
plt.xlabel('Sepal length')
plt.ylabel('Sepal width')
#found a typo here... incorrect from book followed by corrected code
def sort_features(features, outdir, name, header=None, verbose=0):
    """ Sort a feature array using a Ward hierarchical clustering analysis
    on the rows and the columns, and save the reordered matrix as an image.

    The features are standardized column-wise before clustering. The rendered
    figure shows the row dendrogram on the left, the column dendrogram on
    top and the reordered matrix (color range clipped to [-1, 1]).

    Parameters
    ----------
    features: array (N, M)
        an array of features to be sorted.
    outdir: str
        the destination folder where the outputs will be saved.
    name: str
        the name of the plot; the image is written to '<outdir>/<name>.png'.
    header: list (M, ), default None
        the features names, used as x tick labels when given.
    verbose: int, default 0
        the verbosity level.

    Returns
    -------
    features_snap: str
        the path to the saved PNG image of the sorted features.
    """
    # Check inputs
    if verbose > 0:
        print("[info] Sorting features in array of shape "
              "'{0}'...".format(features.shape))

    # Normalize features
    scaler = preprocessing.StandardScaler().fit(features)
    scaled_features = scaler.transform(features)

    # Newer Matplotlib releases ship the seaborn styles under a
    # "seaborn-v0_8-" prefix; use whichever name this installation knows.
    style_name = "seaborn-deep"
    if style_name not in plt.style.available:
        style_name = "seaborn-v0_8-deep"

    # Use the seaborn template to create a pretty display
    with plt.style.context(style_name):

        # Create a figure
        fig = plt.figure(figsize=(12, 12))
        try:
            ax1 = fig.add_axes([0.17, 0.15, 0.1, 0.73])
            ax2 = fig.add_axes([0.3, 0.89, 0.6, 0.1])
            ax2.set_xticks([])
            ax2.set_yticks([])
            ax1.xaxis.set_visible(False)
            ax1.yaxis.set_visible(False)
            ax2.xaxis.set_visible(False)
            ax2.yaxis.set_visible(False)

            # Ward hierarchical clustering analysis on the rows and the
            # columns.
            linkage_array_row = scipy.cluster.hierarchy.linkage(
                scaled_features, method="ward", metric="euclidean")
            dendogram_1 = scipy.cluster.hierarchy.dendrogram(
                linkage_array_row, orientation="left", ax=ax1)
            linkage_array_col = scipy.cluster.hierarchy.linkage(
                scaled_features.transpose(), method="ward",
                metric="euclidean")
            dendogram_2 = scipy.cluster.hierarchy.dendrogram(
                linkage_array_col, ax=ax2)

            # Organize the input feature array: columns first, then rows,
            # both in dendrogram leaf order.
            idx1 = dendogram_1["leaves"]
            idx2 = dendogram_2["leaves"]
            axmatrix = fig.add_axes([0.3, 0.15, 0.6, 0.73])
            matrix = scaled_features[:, idx2]
            matrix = matrix[idx1, :]

            # Render the organized feature matrix. The colormap is passed by
            # name, which every Matplotlib version resolves itself.
            im = axmatrix.matshow(matrix, aspect="auto", origin="lower",
                                  cmap="Spectral", vmin=-1, vmax=1)
            if header is not None:
                clusterized_labels = [header[i] for i in idx2]
                axmatrix.xaxis.set_visible(True)
                axmatrix.xaxis.set_label_position("bottom")
                axmatrix.xaxis.tick_bottom()
                axmatrix.set_xticks(range(len(header)))
                axmatrix.set_xticklabels(clusterized_labels, fontsize=8,
                                         rotation=-90)
            else:
                axmatrix.xaxis.set_visible(False)
            axmatrix.yaxis.set_visible(False)
            axcolor = fig.add_axes([0.91, 0.15, 0.02, 0.73])
            plt.colorbar(im, cax=axcolor)
            plt.title("Organized features", fontsize=10)

            # Display/save the plot
            features_snap = os.path.join(outdir, name + ".png")
            plt.savefig(features_snap, format="png")
        finally:
            # pyplot keeps every figure alive until it is closed explicitly;
            # without this each call would leak one figure.
            plt.close(fig)

    return features_snap
pca = PCA(n_components=i + 1, svd_solver='auto') pca_data = pca.fit_transform(data) return pca, pca_data if __name__ == '__main__': abbrTrain = 'E:\python_project\happinessPredict\DataSet\happiness_train_abbr.csv' abbrTest = 'E:\python_project\happinessPredict\DataSet\happiness_test_abbr.csv' # trainData, happiness = readData.readData(abbrTrain, True) # # 对每一行的样本的同一位置的特征进行z-score标准化 # trainData = preprocessing.scale(trainData) # print(trainData.mean(axis=0)) # print(trainData.std(axis=0)) trainData, happiness = readData.readData(abbrTrain, True) scaler = preprocessing.StandardScaler().fit(trainData) trainData = scaler.transform(trainData) pca, pca_data = get_x_ratio(trainData, 0.95) testData = readData.readData(abbrTest, False) testData = scaler.transform(testData) # 绘制主成分直方图 ratio = pca.explained_variance_ratio_.tolist() plt.figure() plt.grid() plt.bar(range(len(ratio)), ratio, alpha=0.9, facecolor="lightskyblue",
# Build a model that predicts wine quality.
# Target: the 'quality' column; features: every other column.
y = data['quality']
x = data.drop('quality', axis=1)

# Hold out 20% of the rows for testing; the split is stratified on the
# target and seeded so it is reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x, y,
    test_size=0.2,
    random_state=123,
    stratify=y,
)

# Chain the standardization step and the random forest into one pipeline so
# the scaler is always fitted together with the model.
pipeline = make_pipeline(
    preprocessing.StandardScaler(),
    RandomForestRegressor(n_estimators=100),
)

# Hyperparameter grid; the key prefix is the pipeline step name that
# make_pipeline derives from the estimator class.
hyperparameters = dict(
    randomforestregressor__max_features=['auto', 'sqrt', 'log2'],
    randomforestregressor__max_depth=[None, 5, 3, 1],
)

# Grid search with 10-fold cross-validation over the grid above, fitted on
# the training split only.
clf = GridSearchCV(pipeline, hyperparameters, cv=10)
clf.fit(x_train, y_train)
#clf.best_params_  #displays the optimal chosen parameters

# Predict the held-out test rows with the fitted search object.
y_pred = clf.predict(x_test)
def _record_scores(analysis_scr, run_key, label, model, train_X, train_y,
                   test_X, test_y, *score_args):
    """Score a fitted model on the test set, then on the training set.

    One row per data set is appended to ``analysis_scr``, laid out as
    ``run_key + (label, is_test) + metric values``. ``score_args`` are
    forwarded to ``performance_score`` after the truth/prediction pair.
    Returns the test-set predictions.
    """
    predicted_test = model.predict(test_X)
    analysis_scr.append(
        run_key + (label, True)
        + tuple(performance_score(test_y, predicted_test, *score_args).values()))
    predicted_train = model.predict(train_X)
    analysis_scr.append(
        run_key + (label, False)
        + tuple(performance_score(train_y, predicted_train, *score_args).values()))
    return predicted_test


def _fit_and_score(analysis_scr, run_key, label, banner, classifier,
                   train_X, train_y, test_X, test_y, *score_args):
    """Print the banner, fit the classifier and record its scores.

    Returns the test-set predictions of the fitted classifier.
    """
    print(banner)
    classifier.fit(train_X, train_y)
    return _record_scores(analysis_scr, run_key, label, classifier,
                          train_X, train_y, test_X, test_y, *score_args)


def run_models(settings=None):
    """Benchmark classical classifiers and stacked auto-encoders on image pairs.

    For every iteration a balanced set of image pairs is sampled from the
    module-level ``X_total`` / ``y_total``; a pair is positive when the label
    of the first image exceeds the label of the second by exactly one. For
    every requested training-set size the enabled models are trained on raw
    features, on features encoded by one auto-encoder over the whole pair,
    and on features encoded by two auto-encoders (one per image half). Test
    and training scores are collected, written with ``saveAsCsv`` and
    returned.

    Parameters
    ----------
    settings : dict
        Keys read here: ``with_auc_score``, ``number_iterations``,
        ``number_of_training``, ``finetune_lr``, ``batch_size``,
        ``pretrain_lr``, ``pretraining_interations``,
        ``training_interations``, ``hidden_layers_sizes``,
        ``corruption_levels`` and the boolean switches ``SVM``, ``SVM_RBF``,
        ``SVM_POLY``, ``Log``, ``DL``, ``SAE_SVM``, ``SAE_Log``,
        ``SAE_SVM_RBF``, ``SAE_SVM_POLY``, ``DL_S``, ``SAE_S_SVM``,
        ``SAE_S_SVM_RBF``, ``SAE_S_SVM_POLY``. The default of None is not
        usable and fails on the first key access.

    Returns
    -------
    tuple
        ``(sda, a_MAE_original, a_MAE_A, a_MAE_B, analysis_scr)`` from the
        last processed run. ``sda`` is None when ``settings['DL']`` is
        disabled; the auto-encoders are None when no run was executed.
    """
    analysis_scr = []
    with_auc_score = settings['with_auc_score']

    # Loop-invariant hyper-parameters.
    finetune_lr = settings['finetune_lr']
    batch_size = settings['batch_size']
    pretrain_lr = settings['pretrain_lr']
    hidden_layers_sizes = settings['hidden_layers_sizes']
    corruption_levels = settings['corruption_levels']
    # Each half-input auto-encoder gets half-sized hidden layers.
    half_layers_sizes = [size // 2 for size in hidden_layers_sizes]

    # Pre-bind the returned models so the return statement cannot hit an
    # unbound name when a stage is disabled.
    sda = a_MAE_original = a_MAE_A = a_MAE_B = None

    make_classifier = {
        'linear': lambda: LinearSVC(C=1, penalty="l2"),
        'rbf': lambda: SVC(C=1, gamma=0.01, kernel='rbf'),
        'poly': lambda: SVC(C=1, kernel='poly'),
        'log': lambda: sklearn.linear_model.LogisticRegression(
            C=1, penalty='l2'),
    }
    # (settings key and result label, printed banner, classifier kind)
    raw_stages = (
        ('SVM', "SVM", 'linear'),
        ('SVM_RBF', "SVM_RBF", 'rbf'),
        ('SVM_POLY', "SVM_POLY", 'poly'),
        ('Log', "Log", 'log'),
    )
    sae_stages = (
        ('SAE_SVM', 'SAE followed by SVM', 'linear'),
        ('SAE_Log', 'SAE followed by Log', 'log'),
        ('SAE_SVM_RBF', 'SAE followed by SVM RBF', 'rbf'),
        ('SAE_SVM_POLY', 'SAE followed by SVM POLY', 'poly'),
    )
    split_stages = (
        ('SAE_S_SVM', 'SAE_S followed by SVM', 'linear'),
        ('SAE_S_SVM_RBF', 'SAE S followed by SVM RBF', 'rbf'),
        ('SAE_S_SVM_POLY', 'SAE S followed by SVM POLY', 'poly'),
    )

    for subset_no in range(1, settings['number_iterations'] + 1):
        print("Subset: %s" % subset_no)
        ################## generate data ###################
        # Draws stay interleaved (A, B, A, B, ...) so the random stream is
        # consumed in the same order as before.
        array_A = []
        array_B = []
        for _ in range(100000):
            array_A.append(np.random.random_integers(0, 59999))
            array_B.append(np.random.random_integers(0, 59999))

        pos_index = []
        neg_index = []
        for index in range(100000):
            if y_total[array_A[index]] - y_total[array_B[index]] == 1:
                pos_index.append(index)
            else:
                neg_index.append(index)
        print('number of positive examples %d' % len(pos_index))
        # Balance the classes: keep as many negatives as there are positives.
        selected_neg_index = neg_index[:len(pos_index)]

        array_A = np.array(array_A)
        array_B = np.array(array_B)
        X_pos_A = X_total[array_A[pos_index]]
        X_pos_B = X_total[array_B[pos_index]]
        X_pos_whole = np.hstack((X_pos_A, X_pos_B))
        X_neg_A = X_total[array_A[selected_neg_index]]
        X_neg_B = X_total[array_B[selected_neg_index]]
        X_neg_whole = np.hstack((X_neg_A, X_neg_B))
        print('%s %s %s' % (X_pos_A.shape, X_pos_B.shape, X_pos_whole.shape))
        print('%s %s %s' % (X_neg_A.shape, X_neg_B.shape, X_neg_whole.shape))

        X_whole = np.vstack((X_pos_whole, X_neg_whole))
        print(X_whole.shape)
        y_pos = np.ones(X_pos_whole.shape[0])
        y_neg = np.zeros(X_neg_whole.shape[0])
        y_whole = np.concatenate([y_pos, y_neg])
        print(y_whole.shape)

        x_train_pre_validation, x_test, y_train_pre_validation, y_test = \
            train_test_split(X_whole, y_whole, test_size=0.2,
                             random_state=211)

        for number_of_training in settings['number_of_training']:
            x_train, x_validation, y_train, y_validation = train_test_split(
                x_train_pre_validation[:number_of_training],
                y_train_pre_validation[:number_of_training],
                test_size=0.2, random_state=21)
            print('%s %s %s %s %s %s' % (
                x_train.shape, y_train.shape, x_validation.shape,
                y_validation.shape, x_test.shape, y_test.shape))
            ################ end of data ####################
            run_key = (subset_no, number_of_training)

            ### original data ###
            standard_scaler = preprocessing.StandardScaler().fit(x_train)
            scaled_train_X = standard_scaler.transform(x_train)
            scaled_test_X = standard_scaler.transform(x_test)
            for label, banner, kind in raw_stages:
                if settings[label]:
                    predicted_test_y = _fit_and_score(
                        analysis_scr, run_key, label, banner,
                        make_classifier[kind](),
                        scaled_train_X, y_train, scaled_test_X, y_test)

            # direct deep learning
            pretraining_epochs = cal_epochs(
                settings['pretraining_interations'], x_train,
                batch_size=batch_size)
            training_epochs = cal_epochs(
                settings['training_interations'], x_train,
                batch_size=batch_size)
            if settings['DL']:
                print("direct deep learning")
                sda = trainSda(
                    x_train, y_train, x_validation, y_validation,
                    x_test, y_test,
                    hidden_layers_sizes=hidden_layers_sizes,
                    corruption_levels=corruption_levels,
                    batch_size=batch_size,
                    training_epochs=training_epochs,
                    pretraining_epochs=pretraining_epochs,
                    pretrain_lr=pretrain_lr,
                    finetune_lr=finetune_lr)
                print('hidden_layers_sizes: %s' % (hidden_layers_sizes,))
                print('corruption_levels: %s' % (corruption_levels,))
                # The result is deliberately not stored in predicted_test_y,
                # which is only updated by the other stages.
                _record_scores(analysis_scr, run_key, 'DL', sda,
                               x_train, y_train, x_test, y_test)

            #### transformed original data ####
            a_MAE_original = train_a_MultipleAEs(
                x_train,
                pretraining_epochs=pretraining_epochs,
                pretrain_lr=pretrain_lr,
                batch_size=batch_size,
                hidden_layers_sizes=hidden_layers_sizes,
                corruption_levels=corruption_levels)
            encoded_train = a_MAE_original.transform(x_train)
            encoded_test = a_MAE_original.transform(x_test)
            standard_scaler = preprocessing.StandardScaler().fit(
                encoded_train)
            new_x_train_scaled = standard_scaler.transform(encoded_train)
            new_x_test_scaled = standard_scaler.transform(encoded_test)
            for label, banner, kind in sae_stages:
                if settings[label]:
                    predicted_test_y = _fit_and_score(
                        analysis_scr, run_key, label, banner,
                        make_classifier[kind](),
                        new_x_train_scaled, y_train,
                        new_x_test_scaled, y_test)

            #### separated transformed data ####
            print('deep learning using split network')
            # get the new representation for A set. first 784-D
            pretraining_epochs = cal_epochs(
                settings['pretraining_interations'], x_train,
                batch_size=batch_size)
            # Floor division keeps the slice bound an integer.
            train_half = x_train.shape[1] // 2
            test_half = x_test.shape[1] // 2
            validation_half = x_validation.shape[1] // 2

            input_A = x_train[:, :train_half]
            print("original shape for A %s" % (input_A.shape,))
            a_MAE_A = train_a_MultipleAEs(
                input_A,
                pretraining_epochs=pretraining_epochs,
                pretrain_lr=pretrain_lr,
                batch_size=batch_size,
                hidden_layers_sizes=half_layers_sizes,
                corruption_levels=corruption_levels)
            new_x_train_minmax_A = a_MAE_A.transform(
                x_train[:, :train_half])

            input_B = x_train[:, train_half:]
            print("original shape for B %s" % (input_B.shape,))
            a_MAE_B = train_a_MultipleAEs(
                input_B,
                pretraining_epochs=pretraining_epochs,
                pretrain_lr=pretrain_lr,
                batch_size=batch_size,
                hidden_layers_sizes=half_layers_sizes,
                corruption_levels=corruption_levels)
            new_x_train_minmax_B = a_MAE_B.transform(
                x_train[:, train_half:])

            new_x_test_minmax_A = a_MAE_A.transform(x_test[:, :test_half])
            new_x_test_minmax_B = a_MAE_B.transform(x_test[:, test_half:])
            new_x_validation_minmax_A = a_MAE_A.transform(
                x_validation[:, :validation_half])
            new_x_validation_minmax_B = a_MAE_B.transform(
                x_validation[:, validation_half:])

            new_x_train_minmax_whole = np.hstack(
                (new_x_train_minmax_A, new_x_train_minmax_B))
            new_x_test_minmax_whole = np.hstack(
                (new_x_test_minmax_A, new_x_test_minmax_B))
            new_x_validation_minmax_whole = np.hstack(
                (new_x_validation_minmax_A, new_x_validation_minmax_B))

            standard_scaler = preprocessing.StandardScaler().fit(
                new_x_train_minmax_whole)
            new_x_train_minmax_whole_scaled = standard_scaler.transform(
                new_x_train_minmax_whole)
            new_x_test_minmax_whole_scaled = standard_scaler.transform(
                new_x_test_minmax_whole)

            if settings['DL_S']:
                # deep learning using split network
                sda_transformed = trainSda(
                    new_x_train_minmax_whole, y_train,
                    new_x_validation_minmax_whole, y_validation,
                    new_x_test_minmax_whole, y_test,
                    hidden_layers_sizes=hidden_layers_sizes,
                    corruption_levels=corruption_levels,
                    batch_size=batch_size,
                    training_epochs=training_epochs,
                    pretraining_epochs=pretraining_epochs,
                    pretrain_lr=pretrain_lr,
                    finetune_lr=finetune_lr)
                print('hidden_layers_sizes: %s' % (hidden_layers_sizes,))
                print('corruption_levels: %s' % (corruption_levels,))
                predicted_test_y = _record_scores(
                    analysis_scr, run_key, 'DL_S', sda_transformed,
                    new_x_train_minmax_whole, y_train,
                    new_x_test_minmax_whole, y_test, with_auc_score)

            for label, banner, kind in split_stages:
                if settings[label]:
                    predicted_test_y = _fit_and_score(
                        analysis_scr, run_key, label, banner,
                        make_classifier[kind](),
                        new_x_train_minmax_whole_scaled, y_train,
                        new_x_test_minmax_whole_scaled, y_test,
                        with_auc_score)

    # NOTE(review): the original indentation is lost; the report is assumed
    # to be written once, after all runs. y_test and predicted_test_y come
    # from the last run and the last enabled stage.
    report_name = 'DL_handwritten_digits' + '_size_'.join(map(str, hidden_layers_sizes)) + \
                  '_' + str(pretrain_lr) + '_' + str(finetune_lr) + '_' + \
                  '_' + str(settings['pretraining_interations']) + '_' + current_date
    saveAsCsv(with_auc_score, report_name,
              performance_score(y_test, predicted_test_y, with_auc_score),
              analysis_scr)
    return sda, a_MAE_original, a_MAE_A, a_MAE_B, analysis_scr
def __init__(self, X, Y):
    """Fit a ``StandardScaler`` on ``Y`` and store it as ``self.scaler``.

    ``X`` is accepted but not used by this constructor.
    """
    fitted_scaler = preprocessing.StandardScaler()
    fitted_scaler.fit(Y)
    self.scaler = fitted_scaler
def __init__(self, X, Y):
    """Fit separate ``StandardScaler`` objects on ``X`` and on ``Y``.

    The scaler fitted on ``X`` is stored as ``self.input_scaler`` and the
    one fitted on ``Y`` as ``self.output_scaler``.
    """
    in_scaler = preprocessing.StandardScaler()
    in_scaler.fit(X)
    self.input_scaler = in_scaler

    out_scaler = preprocessing.StandardScaler()
    out_scaler.fit(Y)
    self.output_scaler = out_scaler
barmode="group", template="plotly_white", labels={ "var": "Variable", "value": "Value", "variable": "Statistic" }, color_discrete_sequence=px.colors.qualitative.Safe, log_y=True) st.write(feat_fig) st.write( "Wow, that's quite a significant discrepancy - let's scale these to a mean of zero and a standard deviation of 1" ) # Scale the features scaler = preprocessing.StandardScaler().fit(cont_df) X = scaler.transform(cont_df) # Prove that mean = 0, st deviation = 1 feat_desc = pd.DataFrame(X).describe().transpose().reset_index().rename( {'index': "var"}, axis=1) feat_fig = px.bar(feat_desc[['var', 'mean', 'std']].melt(id_vars=['var']), x="var", y="value", color="variable", barmode="group", template="plotly_white", labels={ "var": "Variable", "value": "Value", "variable": "Statistic" },
# Draw one random value from each hyper-parameter candidate list.
zdm = random.choice(ls_z_dim)
lre = random.choice(ls_lr)
epch = random.choice(ls_epoch)

for train_index, test_index in skf.split(GDSCE.values, Y):
    k = k + 1

    # Slice all three GDSC tables with the same fold indices.
    X_trainE = GDSCE.values[train_index, :]
    X_testE = GDSCE.values[test_index, :]
    X_trainM = GDSCM.values[train_index, :]
    X_testM = GDSCM.values[test_index, :]
    X_trainC = GDSCC.values[train_index, :]
    # Fixed: the test split used to be taken from GDSCM (copy-paste slip),
    # which made X_testC a duplicate of X_testM instead of the GDSCC rows.
    X_testC = GDSCC.values[test_index, :]

    y_trainE = Y[train_index]
    y_testE = Y[test_index]

    # Standardise the E table with statistics from the training fold only.
    scalerGDSC = sk.StandardScaler()
    scalerGDSC.fit(X_trainE)
    X_trainE = scalerGDSC.transform(X_trainE)
    X_testE = scalerGDSC.transform(X_testE)

    # The M and C tables are not scaled; NaNs are replaced by zero.
    X_trainM = np.nan_to_num(X_trainM)
    X_trainC = np.nan_to_num(X_trainC)
    X_testM = np.nan_to_num(X_testM)
    X_testC = np.nan_to_num(X_testC)

    # Test-fold tensors.
    TX_testE = torch.FloatTensor(X_testE)
    TX_testM = torch.FloatTensor(X_testM)
    TX_testC = torch.FloatTensor(X_testC)
    ty_testE = torch.FloatTensor(y_testE.astype(int))

    #Train
import joblib
import pandas as pd
import numpy as np

dat = pd.read_csv("trial.csv")
dat.head(3)

#Missing value check
count_nan = dat.isnull().sum()
count_nan

#Fill in missing values
# Assign the filled column back instead of calling fillna(inplace=True) on
# the column selection: the in-place call on a selected column is a chained
# operation that does not update `dat` under pandas Copy-on-Write.
dat['Money_Value'] = dat['Money_Value'].fillna(dat['Money_Value'].mean())
dat.info()

#Divide X and Y
X, Y = dat.iloc[0:, 0:9], dat.iloc[0:, 9:]

#standardization
from sklearn import preprocessing
zscore = preprocessing.StandardScaler()
X = zscore.fit_transform(X)

#spearman Correlation coefficient for feature selection
dfs = dat.corr('spearman')  #Calculate spearman correlation coefficient
print(dfs)
dfs["Ranking"] = dfs["Risk"].rank(method="first")
display(dfs)  #Print all spearman coefficient values
dfs['sort_helper'] = dfs['Risk'].abs()
dfs["absRanking"] = dfs["sort_helper"].rank(method="first")
display(dfs["absRanking"])  #Ascending order

#RFE for feature selection
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
def select_scaler():
    """Return a new, unfitted ``StandardScaler`` built with ``with_mean=False``."""
    return preprocessing.StandardScaler(with_mean=False)
def read_matdataset(self, opt):
    """Load the NUS-WIDE attributes and VGG19 features and fill the dataset fields.

    Reads the pickled word-embedding attributes and the three HDF5 feature
    files from the hard-coded ``NUS-WIDE`` directory, optionally scales the
    features, drops training samples that have no positive label, and stores
    the resulting tensors and bookkeeping values on ``self``.

    Options read from ``opt``: ``validation``, ``preprocessing``,
    ``standardization``, ``N``, ``syn_num``, ``per_seen``, ``per_unseen``
    and ``per_seen_unseen``.
    """
    tic = time.time()
    src = "NUS-WIDE"

    # Paths of the word embeddings and of the two tag lists.
    att_path = os.path.join(
        src, 'word_embedding',
        'NUS_WIDE_pretrained_w2v_glove-wiki-gigaword-300')
    file_tag1k = os.path.join(src, 'NUS_WID_Tags', 'TagList1k.txt')
    file_tag81 = os.path.join(src, 'ConceptsList', 'Concepts81.txt')
    self.seen_cls_idx, _ = get_seen_unseen_classes(file_tag1k, file_tag81)

    # Close the attribute file deterministically (it used to be left open).
    # NOTE(review): pickle.load can execute arbitrary code; only point
    # att_path at a trusted file.
    with open(att_path, 'rb') as att_file:
        src_att = pickle.load(att_file)

    print("attributes are combined in this order-> seen+unseen")
    self.attribute = torch.from_numpy(
        normalize(
            np.concatenate((src_att[0][self.seen_cls_idx], src_att[1]),
                           axis=0)))

    # VGG features path
    train_loc = util.load_dict_from_hdf5(
        os.path.join(src, 'nus_wide_paper_features',
                     'nus_seen_train_vgg19.h5'))
    test_unseen_loc = util.load_dict_from_hdf5(
        os.path.join(src, 'nus_wide_paper_features',
                     'nus_zsl_test_vgg19.h5'))
    test_seen_unseen_loc = util.load_dict_from_hdf5(
        os.path.join(src, 'nus_wide_paper_features',
                     'nus_gzsl_test_vgg19.h5'))

    feature_train_loc = train_loc['features']
    label_train_loc = train_loc['labels']
    feature_test_unseen_loc = test_unseen_loc['features']
    label_test_unseen_loc = test_unseen_loc['labels']
    feature_test_seen_unseen_loc = test_seen_unseen_loc['features']
    label_test_seen_unseen_loc = test_seen_unseen_loc['labels']

    print("Data loading finished, Time taken: {}".format(time.time() - tic))
    tic = time.time()

    # NOTE(review): when opt.validation is truthy nothing below assigns
    # self.train_label / self.train_feature, so the label clean-up that
    # follows would fail on a missing attribute - confirm the intended flow.
    if not opt.validation:
        if opt.preprocessing:
            if opt.standardization:
                print('standardization...')
                scaler = preprocessing.StandardScaler()
            else:
                scaler = preprocessing.MinMaxScaler()

            # Fit on the training features only and reuse the fitted scaler
            # for both test sets.
            _train_feature = scaler.fit_transform(feature_train_loc)
            _test_unseen_feature = scaler.transform(feature_test_unseen_loc)
            _test_seen_unseen_feature = scaler.transform(
                feature_test_seen_unseen_loc)

            # Every split is additionally divided by the maximum of the
            # scaled training features.
            self.train_feature = torch.from_numpy(_train_feature).float()
            mx = self.train_feature.max()
            self.train_feature.mul_(1 / mx)
            self.train_label = torch.from_numpy(label_train_loc).long()

            self.test_unseen_feature = torch.from_numpy(
                _test_unseen_feature).float()
            self.test_unseen_feature.mul_(1 / mx)
            self.test_unseen_label = torch.from_numpy(
                label_test_unseen_loc).long()

            self.test_seen_unseen_feature = torch.from_numpy(
                _test_seen_unseen_feature).float()
            self.test_seen_unseen_feature.mul_(1 / mx)
            self.test_seen_unseen_label = torch.from_numpy(
                label_test_seen_unseen_loc).long()
        else:
            # NOTE(review): this branch does not set the
            # test_seen_unseen_* attributes - confirm that is intended.
            self.train_feature = torch.from_numpy(feature_train_loc).float()
            self.train_label = torch.from_numpy(label_train_loc).long()
            self.test_unseen_feature = torch.from_numpy(
                feature_test_unseen_loc).float()
            self.test_unseen_label = torch.from_numpy(
                label_test_unseen_loc).long()

    print("REMOVING ZEROS LABELS")
    # Clamp labels into [0, 1] and keep only samples with at least one
    # positive label.
    temp_label = torch.clamp(self.train_label, 0, 1)
    temp_seen_labels = temp_label.sum(1)
    temp_label = temp_label[temp_seen_labels > 0]
    self.train_label = temp_label
    self.train_feature = self.train_feature[temp_seen_labels > 0]

    # "Trimmed" subset: samples carrying at most opt.N positive labels.
    self.train_trimmed_label = self.train_label[temp_label.sum(1) <= opt.N]
    self.train_trimmed_feature = self.train_feature[
        temp_label.sum(1) <= opt.N]

    print("Data with N={} labels={}".format(
        opt.N, self.train_trimmed_label.shape))
    print(
        "Full Data labels={} with min label/feature = {} and max label/feature = {}"
        .format(self.train_label.shape,
                temp_label.sum(1).min(),
                temp_label.sum(1).max()))

    self.seenclasses = torch.from_numpy(
        np.arange(0, self.seen_cls_idx.shape[-1]))  # [0-925]
    self.unseenclasses = torch.from_numpy(
        np.arange(0 + self.seen_cls_idx.shape[-1],
                  len(self.attribute)))  # [925-1006]

    self.N = opt.N
    self.syn_num = opt.syn_num
    self.per_seen = opt.per_seen
    self.per_unseen = opt.per_unseen
    self.per_seen_unseen = opt.per_seen_unseen

    print("USING TRAIN FEATURES WITH <=N")
    self.ntrain = self.train_trimmed_feature.size()[0]
    train_labels = self.train_trimmed_label

    self.ntest_unseen = self.test_unseen_feature.size()[0]
    self.ntrain_class = self.seenclasses.size(0)
    self.ntest_class = self.unseenclasses.size(0)
    self.train_class = self.seenclasses.clone()
    self.allclasses = torch.arange(
        0, self.ntrain_class + self.ntest_class).long()

    self.GZSL_fake_test_labels = generate_fake_test_from_train_labels(
        train_labels, self.attribute, self.seenclasses, self.unseenclasses,
        self.syn_num, self.per_seen, self.per_unseen, self.per_seen_unseen)

    print("Data preprocssing finished, Time taken: {}".format(time.time() -
                                                              tic))
# In[69]:

train.columns

# In[70]:

# Columns that are standardised; the scaler is fitted on `train` only and
# then applied to both `train` and `test`.
cols_for_ss = [
    'perc_premium_paid_by_cash_credit',
    'age_in_years',
    'Income',
    'Count_3-6_months_late',
    'Count_6-12_months_late',
    'Count_more_than_12_months_late',
    'application_underwriting_score',
    'no_of_premiums_paid',
    'sourcing_channel',
    'residence_area_type',
]

scaler = preprocessing.StandardScaler()
scaler.fit(train[cols_for_ss])

train[cols_for_ss] = scaler.transform(train[cols_for_ss])
test[cols_for_ss] = scaler.transform(test[cols_for_ss])

# Per-column means learned from the training data.
print(scaler.mean_)

# In[71]:

train.head()

# # Build Training and Testing Model

# In[72]:
# Numeric base columns plus one-hot encoded education levels.
base_columns = ['Principal', 'terms', 'age', 'Gender', 'weekend']
Features = pd.concat(
    [loan_df[base_columns], pd.get_dummies(loan_df['education'])], axis=1)
# Percentage of population is too small
Features = Features.drop(['Master or Above'], axis=1)
Features.head()

# Feature Sets: X
X = Features
X[:5]

# Label: y
y = loan_df['loan_status'].values
y[:5]

# Normalize data - equalise the range and variability of the features so
# that differences in feature magnitude do not bias the models.
X = preprocessing.StandardScaler().fit(X).transform(X.astype(float))
X[:5]

'''************************************'''
''' CLASSIFICATION MODELING '''
'''*************************************'''
# K Nearest Neighbor (KNN)
# Decision Tree
# Support Vector Machine (SVM)
# Logistic Regression
print(loan_df.dtypes)

'''************************************'''
''' KNN - K-Nearest Neighbors '''
'''*************************************'''
from sklearn.model_selection import train_test_split
p_traj_clean_df['id'] += end_id end_id = p_traj_clean_df['id'].values[-1] compute_x2(p_traj_clean_df) compute_x2(hits, prefix='') p_traj_clean_all.append(p_traj_clean_df) xyz = hits.loc[:, ['x', 'y', 'z']].values.transpose() rtp = cart2spherical(xyz).transpose() rtp_df = pd.DataFrame(rtp, columns=('r', 'theta', 'phi')) hits = pd.concat((hits, rtp_df), axis=1) hits_all.append(hits) if i > -1: break scl = preprocessing.StandardScaler() # clf = LinearDiscriminantAnalysis(n_components=None) clf = LFDA(k=2) # clf = NCA() # clf = LMNN(k=2) # clf = RCA_Supervised() X_cols = ('r', 'theta', 'phi', 'x', 'y', 'z', 'x2', 'y2', 'z2') tX_cols = ('r', 'theta', 'phi', 'tx', 'ty', 'tz', 'tx2', 'ty2', 'tz2') # X_cols = ('x2', 'y2', 'z2') # tX_cols = ('tx2', 'ty2', 'tz2') p_traj_clean_cat = pd.concat(p_traj_clean_all, ignore_index=True) hits_cat = pd.concat(hits_all, ignore_index=True) X_scale = scl.fit_transform(hits_cat.loc[:, X_cols].values) X_clean_scale = scl.transform(p_traj_clean_cat.loc[:, tX_cols].values)
#implement scikit-learn train_test_split function #random_state is a seed file number so we can reproduce results X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=123, stratify=y) #stratify--make sure training set looks similar to test set to make eval more reliable #now standardize (subtract mean and divide difference by SD) #scikit-learn simple scalin = X_train_scaled = preprocessing.scale(X_train) #only works on training set not test set #transformer api in scikit allows for fitting current and future data sets #fit transformer to training set scaler = preprocessing.StandardScaler().fit(X_train) #apply transformer to training set X_train_scaled = scaler.transform(X_train) print X_train_scaled.mean(axis=0) print X_train_scaled.std(axis=0) #apply transformer to test set X_test_scaled = scaler.transform(X_test) print X_test_scaled.mean(axis=0) print X_test_scaled.std(axis=0) #modeling pipeling = make_pipeline(preprocessing.StandardScaler(), RandomForestRegressor(n_estimators=1000)) pipeline = make_pipeline(preprocessing.StandardScaler(), RandomForestRegressor(n_estimators=100)) #set hyperparameters hyperparameters = { 'randomforestregressor__max_features': ['auto', 'sqrt', 'log2'], 'randomforestregressor__max_depth': [None, 5, 3, 1]
weight = np.random.normal(loc=70, scale=10, size=1000).reshape(-1, 1)
print(height.shape, weight.shape)
height[:5], weight[:5]

# %%
# Each row is one point (x, y) = (height, weight).
original_data = np.concatenate((height, weight), axis=1)
print(original_data.shape)
original_data[:5]

# %%
plot(original_data, 'Original')

# %%
# Scale each column to zero mean and unit variance.
standard_scaler_data = preprocessing.StandardScaler().fit(
    original_data).transform(original_data)
plot(standard_scaler_data, 'StandardScaler')

# %%
# Scale each column into the range [0, 1].
min_max_scaler_data = preprocessing.MinMaxScaler().fit(
    original_data).transform(original_data)
plot(min_max_scaler_data, 'MinMaxScaler')

# %%
# Scale each column into the range [-1, 1] by its maximum absolute value.
max_abs_scaler_data = preprocessing.MaxAbsScaler().fit(
    original_data).transform(original_data)
plot(max_abs_scaler_data, 'MaxAbsScaler')

# %%
# Rescale each sample (row) to unit norm; unlike the scalers above this
# works per row, not per column.
normalizer_data = preprocessing.Normalizer().fit(
    original_data).transform(original_data)
header=None, low_memory=False) df, _ = prep_data('') print(df.describe()) print('=== linear regression ===') regr = linear_model.LinearRegression() print('r2 = %.2f' % cross_val_score( regr, df.iloc[:, :-1], df.iloc[:, -1:], cv=10, scoring='r2').mean()) print('rmse = %.2f' % np.sqrt(-1 * cross_val_score(regr, df.iloc[:, :-1], df.iloc[:, -1:], cv=10, scoring='neg_mean_squared_error')).mean()) regr = Pipeline([('trans', preprocessing.StandardScaler()), ('regr', regr)]) print('r2 = %.2f' % cross_val_score( regr, df.iloc[:, :-1], df.iloc[:, -1:], cv=10, scoring='r2').mean()) print('rmse = %.2f' % np.sqrt(-1 * cross_val_score(regr, df.iloc[:, :-1], df.iloc[:, -1:], cv=10, scoring='neg_mean_squared_error')).mean()) print('=== ridge ===') regr = linear_model.Ridge(alpha=.05) print(cross_val_score(regr, df.iloc[:, :-1], df.iloc[:, -1:], cv=10).mean()) print('=== lasso ===') regr = linear_model.Lasso(alpha=.05) print(cross_val_score(regr, df.iloc[:, :-1], df.iloc[:, -1:], cv=10).mean()) print('=== Poly Linear ===')
# N x N grid whose every row is 0..N-1, then reduced with the triangular mask.
col_ind = np.tile(np.arange(N), (N, 1))
row_ind = row_ind[mask_tri]
col_ind = col_ind[mask_tri]

#%% classifier and learning parameters


# MLR adapted for recursive feature elimination (RFE)
class RFE_pipeline(skppl.Pipeline):
    def fit(self, X, y=None, **fit_params):
        """Fit the pipeline, then expose the last step's ``coef_`` on the
        pipeline itself so that RFE can read the coefficients.
        """
        skppl.Pipeline.fit(self, X, y, **fit_params)
        _, final_estimator = self.steps[-1]
        self.coef_ = final_estimator.coef_
        return self


mlr_steps = [
    ('std_scal', skprp.StandardScaler()),
    ('clf',
     skllm.LogisticRegression(C=10,
                              penalty='l2',
                              multi_class='multinomial',
                              solver='lbfgs',
                              max_iter=500)),
]
c_MLR = RFE_pipeline(mlr_steps)

# nearest neighbor
c_1NN = sklnn.KNeighborsClassifier(n_neighbors=1,
                                   algorithm='brute',
                                   metric='correlation')

# cross-validation scheme
cv_schem = skms.StratifiedShuffleSplit(n_splits=1, test_size=0.2)
n_rep = 10  # number of repetitions
# Encode each distinct formation name as an integer code in 'FormationNum'.
feature_vectors.insert(1, 'FormationNum', 0)
for ii, formation in enumerate(feature_vectors['Formation'].unique()):
    # Single .loc assignment instead of chained indexing
    # (`df.FormationNum[mask] = ii`), which may write to a temporary copy
    # and does not update the frame under pandas Copy-on-Write.
    feature_vectors.loc[feature_vectors.Formation == formation,
                        'FormationNum'] = ii
feature_vectors = feature_vectors.drop(['Formation'], axis=1)

# ***
# Normalizing and splitting data

# In[3]:

from sklearn import preprocessing
from sklearn.model_selection import train_test_split

# NOTE(review): the scaler is fitted on all rows before the train/test
# split, so test rows contribute to the scaling statistics.
scaler = preprocessing.StandardScaler().fit(feature_vectors)
scaled_features = scaler.transform(feature_vectors)
X_train, X_test, y_train, y_test = train_test_split(scaled_features,
                                                    correct_facies_labels,
                                                    test_size=0.2,
                                                    random_state=42)

#%% Use tpot
from tpot import TPOTClassifier
from sklearn.pipeline import make_pipeline, make_union
from sklearn.ensemble import ExtraTreesClassifier, VotingClassifier

#tpot = TPOTClassifier(generations=5, population_size=20, verbosity=2,
#                      max_eval_time_mins = 20, max_time_mins=100, scoring='f1_micro')
#tpot.fit(X_train, y_train)
robot_state = pd.concat([robot_positions, robot_velocitys], axis=1) else: robot_state_single_cam = pd.concat([robot_positions, robot_velocitys], axis=1) robot_state = pd.concat([robot_state, robot_state_single_cam], axis=0) robot_states_list = robot_state.values.tolist() list_ = [] for i in range(0, len(robot_state[0]), 4): list_.append(robot_states_list[i]) robot_states_frame_rate = pd.DataFrame(list_) print("Done") ################################## Standardization for Robot States ################################################################### robot_state_names = robot_states_frame_rate.columns scaler = preprocessing.StandardScaler() myScaler = scaler.fit(robot_states_frame_rate) robot_states_frame_rate = myScaler.transform(robot_states_frame_rate) robot_states_frame_rate = pd.DataFrame(robot_states_frame_rate, columns=robot_state_names) print(robot_states_frame_rate.shape) ################################## Load Strawberry Data ################################################################## strawberry_1 = pd.read_csv('/content/data_set_003/straw_1/data_set_' + str(0) + '_strawberry_data_store_1.csv', delimiter=',', error_bad_lines=False, header=None) strawberry_2 = pd.read_csv('/content/data_set_003/straw_2/data_set_' + str(0) + '_strawberry_data_store_2.csv', delimiter=',',
def read_hdf_data_psi(path = 'premix_data', key='of_tables', in_labels=['zeta','f','pv'], labels = ['T'], scaler = None):
    """Read a table from HDF5, add a ``psi`` column and return scaled arrays.

    ``psi`` is computed per row as ``1 / (R_m * T)`` with
    ``R_m = R_universal * sum(mass_fraction / molar_weight)``. The molar
    weights are read from ``molar_weights.json`` and the species order from
    ``GRI_species_order``, both in the current working directory.

    Parameters
    ----------
    path : str
        Path of the HDF5 file.
    key : str
        Key of the table inside the HDF5 file.
    in_labels : list of str
        Columns used as inputs. The default list is never mutated here.
    labels : list of str
        Columns used as labels. The default list is never mutated here.
    scaler : {'MinMax', 'Standard'}
        Scaler type used for both inputs and labels.

    Returns
    -------
    tuple
        ``(input_np, label_np, df, in_scaler, out_scaler)``: the scaled
        inputs and labels, the data frame including ``psi``, and the two
        fitted scalers.

    Raises
    ------
    ValueError
        If ``scaler`` is neither ``'MinMax'`` nor ``'Standard'`` (this
        includes the default ``None``).
    Exception
        Any error raised by ``pd.read_hdf`` is re-raised after the hint is
        printed.
    """
    # read in the hdf5 file
    # AND COMPUTE PSI OF THE MIXTURE
    try:
        df = pd.read_hdf(path, key=key)
    except Exception:
        # Print the hint but re-raise: continuing here used to fail later
        # with an unrelated error because `df` was never assigned.
        print('Check the data path and key')
        raise

    # read the molar weigths
    with open('molar_weights.json', 'r') as fp:
        molar_weights = json.load(fp)

    # read in the order of the species names
    with open('GRI_species_order') as f:
        all_species = f.read().splitlines()

    # numpy array of species molar weights
    molar_weights_np = np.array([molar_weights[s] for s in all_species])
    molar_weights_np = molar_weights_np / 1000  # conversion from g to kg! This is needed for OpenFOAM

    # `.values` replaces `as_matrix()`, which was removed in pandas 1.0.
    T_vector = df['T'].values
    gri_mass_frac = df[all_species].values

    # COMPUTE THE CORRECT PSI VALUE
    R_universal = 8.314459
    print('Starting to compute psi ... ')
    # Vectorised over all rows: the division broadcasts the molar weights
    # across rows and the sum runs over the species axis.
    R_m = R_universal * (gri_mass_frac / molar_weights_np).sum(axis=1)
    # hand back the data to df
    df['psi'] = 1 / (R_m * T_vector)
    print('Done with psi!\n')

    input_df = df[in_labels]

    if scaler == 'MinMax':
        in_scaler = preprocessing.MinMaxScaler()
        out_scaler = preprocessing.MinMaxScaler()
    elif scaler == 'Standard':
        in_scaler = preprocessing.StandardScaler()
        out_scaler = preprocessing.StandardScaler()
    else:
        raise ValueError('Only possible scalers are: MinMax or Standard.')

    input_np = in_scaler.fit_transform(input_df)

    label_df = df[labels]
    label_np = out_scaler.fit_transform(label_df)

    print('\n*******************************')
    print('The scaler is %s\n' % scaler)
    print('This is the order of the labels:')
    for f in labels:
        print(f)
    print('*******************************\n')

    return input_np, label_np, df, in_scaler, out_scaler
def crossValidationOneOption(filename, targetIdx, rankingMethod, algorithm, numFeatures, cv=5):
    """
    This function evaluates the performance of the specified algorithm to
    predict target variable in the next year using cross validation and mean
    absolute value. This function is used in iteration 2.

    ** This function is hard coded. Please be careful while editing. **

    Parameters
    ----------
    filename : string
        The name of stacked file in iteration 2.
    targetIdx : int
        The index of target variable in the target list.
    rankingMethod : int
        The number specified how predictors are ranked.
        (See the file of ranked predictor lists)
    algorithm : string (usually 3-character)
        The code of prediction algorithm such as 'RID' for ridge regression.
    numFeatures : int
        The number of features included in the model.
    cv : int
        The number of fold for cross validation.

    Return Value
    ----------
    None when the evaluation ran; the results (error, sd, time spent) are
    printed. If ``getEstimator`` returns None, the string
    ``algorithm + ': Wrong Algorithm'`` is returned instead.
    """
    # filename = '2006-2013_FilteredColsTargetMissingBlank.csv'
    header = getHeader(filename)
    startTargetIndex, startPredictorIndex, numCols = getTargetAndPredictorIndex(header)
    numTargets = startPredictorIndex - startTargetIndex
    predictorHeader = header[startPredictorIndex:]
    targetHeader = header[startTargetIndex:startPredictorIndex]
    targetIDList = [int(head[0:4]) for head in targetHeader]

    # Only the target and predictor columns are loaded, so within `dataset`
    # the targets come first and the predictors start at column numTargets.
    dataset = np.genfromtxt(filename, delimiter=",", skip_header=1, autostrip=True,
                            missing_values=np.nan,
                            usecols=tuple(range(startTargetIndex, numCols)))

    X = dataset[:, numTargets:]
    y = dataset[:, targetIdx]

    # Drop the rows whose target value is missing.
    keepRows = ~np.isnan(y)
    X = X[keepRows, :]
    y = y[keepRows].reshape(-1, 1)

    # Fixed: the transformed predictors used to be discarded
    # (`Xscaler.transform(X)` without assignment), so X was never scaled.
    Xscaler = preprocessing.StandardScaler().fit(X)
    X = Xscaler.transform(X)

    # y itself stays unscaled here; the fitted scaler is handed to
    # getEstimator.
    Yscaler = preprocessing.StandardScaler().fit(y)

    estimator = getEstimator(algorithm, Yscaler, numFeatures)
    if estimator is None:
        return algorithm + ': Wrong Algorithm'

    XIndex = getFeaturesIndex(predictorHeader, 'Feature Selection 2006-2013',
                              targetIDList[targetIdx], rankingMethod, numFeatures)
    Xready = X[:, tuple(XIndex)]
    y = np.ravel(y)

    startTime = time.time()
    predicted = cross_val_predict(estimator, Xready, y, cv=cv)
    absolute_error = np.absolute(y - predicted)
    score_mean = absolute_error.mean()
    score_sd = absolute_error.std()
    timeProcessed = time.time() - startTime

    # A single parenthesised argument prints identically on Python 2 and 3.
    print('%s = rankingMethod %d, algo %s, numFeatures %d, score(mean, sd) = (%f,%f), time = %f' % (
        targetHeader[targetIdx], rankingMethod, algorithm, numFeatures,
        score_mean, score_sd, timeProcessed))
def define_regressors():
    '''
    Build the collection of regressors the data is trained with.

    Every candidate regressor is registered here, either as a bare
    estimator or as a pipeline that also performs a preprocessing step.
    The hyper-parameters are hard coded because they are expected to stay
    more or less constant once tuned.

    TODO: Include a feature selection method in the pipeline?
          That way it can be done automatically separately in each energy bin.
          (see https://scikit-learn.org/stable/modules/feature_selection.html).

    Returns
    -------
    A dictionary mapping a regressor name to the estimator to train.
    '''

    def quantile_mlp(layers, activation, distribution='normal', solver='adam'):
        # All MLP variants share this shape: a quantile transform followed
        # by an MLPRegressor; only the arguments above vary between them.
        return make_pipeline(
            preprocessing.QuantileTransformer(output_distribution=distribution,
                                              random_state=0),
            MLPRegressor(
                hidden_layer_sizes=layers,
                solver=solver,
                max_iter=20000,
                activation=activation,
                tol=1e-5,
                # early_stopping=True,
                random_state=0
            )
        )

    return {
        'random_forest': RandomForestRegressor(n_estimators=300, random_state=0, n_jobs=8),
        'MLP': quantile_mlp((80, 45), 'tanh'),
        'MLP_relu': quantile_mlp((100, 50), 'relu'),
        'MLP_logistic': quantile_mlp((80, 45), 'logistic'),
        'MLP_uniform': quantile_mlp((80, 45), 'tanh', distribution='uniform'),
        'MLP_small': quantile_mlp((36, 6), 'tanh'),
        'MLP_lbfgs': quantile_mlp((36, 6), 'logistic', solver='lbfgs'),
        'BDT': AdaBoostRegressor(
            DecisionTreeRegressor(max_depth=30, random_state=0),
            n_estimators=1000,
            random_state=0
        ),
        'linear_regression': LinearRegression(n_jobs=4),
        'ridge': Ridge(alpha=1.0),
        'SVR': SVR(C=10.0, epsilon=0.2),
        'linear_SVR': make_pipeline(
            preprocessing.StandardScaler(),
            LinearSVR(random_state=0, tol=1e-5, C=10.0, epsilon=0.2, max_iter=100000)
        ),
        'SGD': make_pipeline(
            preprocessing.StandardScaler(),
            SGDRegressor(loss='epsilon_insensitive', max_iter=20000, tol=1e-5)
        ),
    }
def standard_scale(X_train, X_test):
    """Standardise both splits with a scaler fitted on ``X_train`` only.

    Returns the transformed ``(X_train, X_test)`` pair.
    """
    scaler = prep.StandardScaler()
    scaler.fit(X_train)
    return scaler.transform(X_train), scaler.transform(X_test)
# In[18]:

y = df['loan_status'].values
y[:5]

# ## Normalize Data
# Standardization gives the data zero mean and unit variance
# (technically this should be done after the train/test split).

# In[19]:

X = preprocessing.StandardScaler().fit_transform(X)
X[:5]

# # Classification
# We use the test set to report the accuracy of the model.
# We are going to use the following algorithms:
# - K Nearest Neighbor(KNN)
# - Decision Tree
# - Support Vector Machine
# - Logistic Regression
#
#
# # K Nearest Neighbor(KNN)
#