Example #1
0
def main(args, config):
    """Train a Keras classifier on one fold of a ROOT dataset.

    The function loads one tree per class from the ROOT file selected by
    ``args.fold``, builds inputs, one-hot targets and event weights, fits and
    pickles an input scaler, splits the data into a training and a validation
    part, trains the configured Keras model and writes the model and the
    training history to ``config["output_path"]``.

    Parameters
    ----------
    args : object
        Only ``args.fold`` is read; it indexes ``config["datasets"]`` and is
        used in the output file names.
    config : mapping
        Keys read here: ``seed``, ``variables``, ``datasets``, ``classes``,
        ``event_weights``, ``class_weights``, ``global_weight_scale``,
        ``preprocessing``, ``output_path``, ``train_test_split`` and
        ``model`` (``name``, ``batch_size``, ``epochs`` and the optional
        ``early_stopping``, ``save_best_only``, ``reduce_lr_on_plateau``).

    Raises
    ------
    Exception
        If a class tree is missing from the input file, the preprocessing
        method is unknown, or the model name is not found in ``keras_models``.
    """
    # Set seed and import packages
    # NOTE: This needs to be done before any keras module is imported!
    seed = int(config["seed"])
    logger.debug("Import packages and set random seed to %s.", seed)
    import numpy as np
    np.random.seed(seed)

    import ROOT
    ROOT.PyConfig.IgnoreCommandLineOptions = True  # disable ROOT internal argument parser
    import root_numpy

    from sklearn import preprocessing, model_selection
    import keras_models
    from keras.callbacks import ReduceLROnPlateau, EarlyStopping, ModelCheckpoint

    # Extract list of variables
    variables = config["variables"]
    logger.debug("Use variables:")
    for v in variables:
        logger.debug("%s", v)

    # Load training dataset
    filename = config["datasets"][args.fold]
    logger.debug("Load training dataset from %s.", filename)
    x = []
    y = []
    w = []
    classes = config["classes"]
    weight_branch = config["event_weights"]
    rfile = ROOT.TFile(filename, "READ")
    try:
        for i_class, class_ in enumerate(classes):
            logger.debug("Process class %s.", class_)
            tree = rfile.Get(class_)
            # NOTE(review): `== None` is kept deliberately. PyROOT can return
            # a null-pointer proxy for a missing key, which compares equal to
            # None but would fail an `is None` identity test -- confirm for
            # the ROOT version in use before changing this.
            if tree == None:  # noqa: E711
                logger.fatal("Tree %s not found in file %s.", class_,
                             filename)
                raise Exception("Tree {} not found in file {}.".format(
                    class_, filename))
            n_entries = tree.GetEntries()

            # Get inputs for this class (one column per variable)
            x_class = np.zeros((n_entries, len(variables)))
            x_conv = root_numpy.tree2array(tree, branches=variables)
            for i_var, var in enumerate(variables):
                x_class[:, i_var] = x_conv[var]
            x.append(x_class)

            # Get weights: per-event weight scaled by the class weight
            w_class = np.zeros((n_entries, 1))
            w_conv = root_numpy.tree2array(tree, branches=[weight_branch])
            w_class[:, 0] = (w_conv[weight_branch] *
                             config["class_weights"][class_])
            w.append(w_class)

            # Get targets for this class (one-hot encoded class index)
            y_class = np.zeros((n_entries, len(classes)))
            y_class[:, i_class] = 1.0
            y.append(y_class)
    finally:
        # All branches have been copied into numpy arrays at this point.
        rfile.Close()

    # Stack inputs, targets and weights to a Keras-readable dataset
    x = np.vstack(x)  # inputs
    y = np.vstack(y)  # targets
    w = np.vstack(w) * config["global_weight_scale"]  # weights
    w = np.squeeze(w)  # needed to get weights into keras

    # Perform input variable transformation and pickle scaler object
    method = config["preprocessing"]
    logger.info("Use preprocessing method %s.", method)
    # (label, first statistic, second statistic) to log per variable, if any
    summary = None
    if "standard_scaler" in method:
        scaler = preprocessing.StandardScaler().fit(x)
        summary = ("mean, std", scaler.mean_, scaler.scale_)
    elif "identity" in method:
        # Fit a regular scaler, then neutralise it so that transform() is a
        # no-op while the pickled object keeps the StandardScaler interface.
        scaler = preprocessing.StandardScaler().fit(x)
        scaler.mean_[:] = 0.0
        scaler.scale_[:] = 1.0
        summary = ("mean, std", scaler.mean_, scaler.scale_)
    elif "robust_scaler" in method:
        scaler = preprocessing.RobustScaler().fit(x)
        summary = ("mean, std", scaler.center_, scaler.scale_)
    elif "min_max_scaler" in method:
        scaler = preprocessing.MinMaxScaler(feature_range=(-1.0, 1.0)).fit(x)
        summary = ("min, max", scaler.data_min_, scaler.data_max_)
    elif "quantile_transformer" in method:
        scaler = preprocessing.QuantileTransformer(
            output_distribution="normal", random_state=seed).fit(x)
    else:
        logger.fatal("Preprocessing %s is not implemented.", method)
        raise Exception(
            "Preprocessing {} is not implemented.".format(method))
    if summary is not None:
        label, first_stat, second_stat = summary
        for var, first, second in zip(variables, first_stat, second_stat):
            logger.debug("Preprocessing (variable, %s): %s, %s, %s", label,
                         var, first, second)
    x = scaler.transform(x)

    path_preprocessing = os.path.join(
        config["output_path"],
        "fold{}_keras_preprocessing.pickle".format(args.fold))
    logger.info("Write preprocessing object to %s.", path_preprocessing)
    with open(path_preprocessing, 'wb') as scaler_file:
        pickle.dump(scaler, scaler_file)

    # Split data in training and testing
    x_train, x_test, y_train, y_test, w_train, w_test = model_selection.train_test_split(
        x,
        y,
        w,
        test_size=1.0 - config["train_test_split"],
        random_state=seed)

    # Add callbacks
    model_config = config["model"]
    callbacks = []
    if "early_stopping" in model_config:
        logger.info("Stop early after %s tries.",
                    model_config["early_stopping"])
        callbacks.append(
            EarlyStopping(patience=model_config["early_stopping"]))

    path_model = os.path.join(config["output_path"],
                              "fold{}_keras_model.h5".format(args.fold))
    # True only if the option is present AND enabled.
    save_best_only = ("save_best_only" in model_config
                      and bool(model_config["save_best_only"]))
    if save_best_only:
        logger.info("Write best model to %s.", path_model)
        callbacks.append(
            ModelCheckpoint(path_model, save_best_only=True, verbose=1))

    if "reduce_lr_on_plateau" in model_config:
        logger.info("Reduce learning-rate after %s tries.",
                    model_config["reduce_lr_on_plateau"])
        callbacks.append(
            ReduceLROnPlateau(patience=model_config["reduce_lr_on_plateau"],
                              verbose=1))

    # Train model
    model_name = model_config["name"]
    if not hasattr(keras_models, model_name):
        logger.fatal("Model %s is not implemented.", model_name)
        raise Exception("Model {} is not implemented.".format(model_name))
    logger.info("Train keras model %s.", model_name)

    # A negative batch size means "use the full training set as one batch".
    if model_config["batch_size"] < 0:
        batch_size = x_train.shape[0]
    else:
        batch_size = model_config["batch_size"]

    model_impl = getattr(keras_models, model_name)
    model = model_impl(len(variables), len(classes))
    model.summary()
    # NOTE(review): `nb_epoch` is the Keras 1 spelling; newer Keras releases
    # call this argument `epochs`. Left unchanged because the installed Keras
    # version is not visible from here.
    fitResults = model.fit(x_train,
                           y_train,
                           sample_weight=w_train,
                           validation_data=(x_test, y_test, w_test),
                           batch_size=batch_size,
                           nb_epoch=model_config["epochs"],
                           shuffle=True,
                           callbacks=callbacks)

    # Save model. The checkpoint callback only writes the file when
    # save_best_only is enabled, so the final model has to be written here in
    # every other case -- including an explicit `save_best_only: false`.
    if not save_best_only:
        logger.info("Write model to %s.", path_model)
        model.save(path_model)

    # Strip the ".h5" suffix to derive the history file name.
    with open(path_model[:-3] + "_history.pkl", "wb") as history_file:
        pickle.dump(fitResults.history, history_file)
Example #2
0
def validationLargeFile(filename, testFilename, cv=5):
    """
    Evaluate every candidate algorithm for every target variable and save the
    results as CSV files.

    For each target, each ranking method, each algorithm and each number of
    features (1..50), an estimator is fitted on the first 75% of the usable
    training rows and scored on the remaining 25% with the mean absolute
    error. The best configuration per (target, ranking method) is collected.

    This function is used in iteration 2.

    ** This function is hard coded. Please be careful while editing. **

    Parameters
    ----------
    filename : string
        The name of stacked file used as training data in iteration 2.

    testFilename : string
        The name of file used as test data in iteration 2. It is only used
        for the printed target statistics; scoring uses the 75/25 split of
        the training file.

    cv : int
        The number of fold for cross validation. Not used in this function.

    Return Value
    ----------
    None on success (the results are saved as files instead). If
    getEstimator does not know an algorithm code, the string
    '<algorithm>: Wrong Algorithm' is returned immediately.

    """
    # filename = '2006-2013_FilteredColsTargetMissingBlank.csv'
    header = getHeader(filename)
    startTargetIndex, startPredictorIndex, numCols = getTargetAndPredictorIndex(header)
    numTargets = startPredictorIndex - startTargetIndex
    predictorHeader = header[startPredictorIndex:]
    targetHeader = header[startTargetIndex:startPredictorIndex]
    targetIDList = [int(head[0:4]) for head in targetHeader]

    collectBest = [['targetIdx', 'rankingMethod', 'algorithm', 'numFeatures', 'score', 'sd', 'timeProcessed']]

    # Both files are read from the first target column onwards, so columns
    # [0, numTargets) are targets and the rest are predictors.
    usedCols = tuple(range(startTargetIndex, numCols))
    dataset = np.genfromtxt(filename, delimiter=",", skip_header=1, autostrip=True, missing_values=np.nan, usecols=usedCols)
    testset = np.genfromtxt(testFilename, delimiter=",", skip_header=1, autostrip=True, missing_values=np.nan, usecols=usedCols)

    algos = ['SVL', 'RBF', 'LAS', 'RID', 'ELA', 'MLP', 'ML1', 'ML2', 'ML3', 'ML4', 'ML5', 'ML6', 'ML7', 'ML8', 'ML9']

    for targetIdx in range(0, numTargets):
        # Training Data ---------------------------------------------------
        y = dataset[:, targetIdx]
        keepRows = np.invert(np.isnan(y))
        X = dataset[keepRows, numTargets:]
        y = y[keepRows].reshape(-1, 1)

        # transform() returns a new array; the result must be kept, otherwise
        # the estimators are trained on unscaled features.
        Xscaler = preprocessing.StandardScaler().fit(X)
        X = Xscaler.transform(X)

        # Test Data ---------------------------------------------------
        yTest = testset[:, targetIdx]
        keepRowsTest = np.invert(np.isnan(yTest))
        XTest = Xscaler.transform(testset[keepRowsTest, numTargets:])
        yTest = yTest[keepRowsTest].reshape(-1, 1)

        # Statistics over train + test targets are only printed; the scaler
        # handed to the estimators is fitted on the training targets alone.
        yAll = np.concatenate((y, yTest))
        Yscaler = preprocessing.StandardScaler().fit(yAll)
        print('Target %d => Mean %.5f , STD %.5f, Min %.5f, Max %.5f' % (targetIdx, Yscaler.mean_[0], Yscaler.scale_[0], yAll.min(), yAll.max()))
        Yscaler = preprocessing.StandardScaler().fit(y)

        y = np.ravel(y)
        # First 75% of the rows are used for fitting, the rest for scoring.
        firstTestIndex = int(0.75 * X.shape[0])

        rows = []
        for rankingMethod in [0, 1, 2, 3]:
            numFeaturesTest = list(range(1, 51))

            best = [targetIdx, rankingMethod, None, None, None, None, None]
            # The first row of each table holds the feature counts; one row
            # per algorithm is appended below and the table is transposed
            # before being written.
            scoreTable = [numFeaturesTest]
            timeTable = [numFeaturesTest]
            sdTable = [numFeaturesTest]

            for algorithm in algos:
                scoreList = []
                timeList = []
                sdList = []

                for numFeatures in numFeaturesTest:
                    estimator = getEstimator(algorithm, Yscaler, numFeatures)
                    if estimator is None:
                        return algorithm + ': Wrong Algorithm'
                    XIndex = getFeaturesIndex(predictorHeader, 'Feature Selection 2006-2013', targetIDList[targetIdx], rankingMethod, numFeatures)
                    Xready = X[:, tuple(XIndex)]

                    startTime = time.time()

                    estimator.fit(Xready[0:firstTestIndex, :], y[0:firstTestIndex])
                    predicted = estimator.predict(Xready[firstTestIndex:, :])
                    # ravel() guards against an (n, 1) prediction being
                    # broadcast against the 1-D target into an (n, n) matrix.
                    absolute_error = np.absolute(y[firstTestIndex:] - np.ravel(predicted))

                    score = absolute_error.mean()
                    sd = absolute_error.std()

                    timeProcessed = time.time() - startTime
                    scoreList.append(score)
                    sdList.append(sd)
                    timeList.append(timeProcessed)
                    # Keep the configuration with the lowest mean absolute error.
                    if best[2] is None or score < best[4]:
                        best = [targetIdx, rankingMethod, algorithm, numFeatures, score, sd, timeProcessed]
                scoreTable.append(scoreList)
                timeTable.append(timeList)
                sdTable.append(sdList)

            print('target %d, rankingMethod %d' % (targetIdx, rankingMethod))
            print(best)
            collectBest.append(best)

            scoreTable = np.transpose(np.array(scoreTable)).tolist()
            timeTable = np.transpose(np.array(timeTable)).tolist()
            sdTable = np.transpose(np.array(sdTable)).tolist()

            rows.append(['score of ranking method = ' + str(rankingMethod)] + algos)
            rows.extend(scoreTable)
            rows.append(['sd of ranking method = ' + str(rankingMethod)] + algos)
            rows.extend(sdTable)
            rows.append(['time of ranking method = ' + str(rankingMethod)] + algos)
            rows.extend(timeTable)
            rows.append((1 + len(algos)) * [''])

        # Written once per target; collectBest is re-written with a fresh
        # timestamp each time, so partial results exist after every target.
        writeCSV('./Validation 2006-2012 Three Set/Indicator'+str(targetIDList[targetIdx])+'-'+time.strftime("%Y-%m-%d-%H-%M-%S")+'-'.join(algos)+'.csv', rows)
        writeCSV('./Validation 2006-2012 Three Set/CollectBest'+'-'+time.strftime("%Y-%m-%d-%H-%M-%S")+'.csv', collectBest)
Example #3
0
def testModel(testFilename, filename, targetIdx, rankingMethod, algorithm, numFeatures, gridSearch = False,cv=5):
    """
    This function trains and tests the input model configurations with the test dataset.

    Parameters
    ----------
    testFilename : string
        The name of file in iteration 2 used as testing data.

    filename : string
        The name of stacked file in iteration 2 used as training data.

    targetIdx : int
        The index of target variable in the target list.

    rankingMethod : int
        The number specified how predictors are ranked. (See the file of ranked predictor lists)

    algorithm : string (usually 3-character)
        The code of prediction algorithm such as 'RID' for ridge regression.

    numFeatures : int
        The number of features included in the model.

    gridSearch : boolean
        Perform parameter optimization or not.

    cv : int
        The number of fold for cross validation. Not used in this function.

    Return Value
    ----------
    None on success; the results (error, sd) are printed. The string
    '<algorithm>: Wrong Algorithm' is returned when no estimator could be
    built, and 'Column Error' when the test file's columns do not line up
    with the training file's.

    """
    # filename = '2006-2013_FilteredColsTargetMissingBlank.csv'
    header = getHeader(filename)
    startTargetIndex, startPredictorIndex, numCols = getTargetAndPredictorIndex(header)
    numTargets = startPredictorIndex - startTargetIndex
    predictorHeader = header[startPredictorIndex:]
    targetHeader = header[startTargetIndex:startPredictorIndex]
    targetIDList = [int(head[0:4]) for head in targetHeader]

    # The file is read from the first target column onwards, so columns
    # [0, numTargets) are targets and the rest are predictors.
    dataset = np.genfromtxt(filename, delimiter=",", skip_header=1, autostrip=True, missing_values=np.nan, usecols=tuple(range(startTargetIndex,numCols)))
    y = dataset[:, targetIdx]
    keepRows = np.invert(np.isnan(y))
    X = dataset[keepRows, numTargets:]
    y = y[keepRows].reshape(-1, 1)

    print(X.shape)
    # transform() returns a new array; the result must be kept, otherwise
    # the estimator is trained on unscaled features.
    Xscaler = preprocessing.StandardScaler().fit(X)
    X = Xscaler.transform(X)

    Yscaler = preprocessing.StandardScaler().fit(y)

    if not gridSearch:
        estimator = getEstimator(algorithm, Yscaler, numFeatures)
    else:
        estimator = getEstimatorGridSearch(algorithm, Yscaler, numFeatures)
    if estimator is None:
        return algorithm + ': Wrong Algorithm'

    XIndex = getFeaturesIndex(predictorHeader, 'Feature Selection 2006-2013', targetIDList[targetIdx], rankingMethod, numFeatures)
    Xready = X[:, tuple(XIndex)]
    y = np.ravel(y)

    estimator.fit(Xready, y)
    print('Best params for %s = rankingMethod %d, algo %s, numFeatures %d' % (targetHeader[targetIdx][0:4], rankingMethod, algorithm, numFeatures))
    if hasattr(estimator, 'best_params_'):
        print(estimator.best_params_)
    else:
        pprint(vars(estimator))

    #---------------------------------------Test-------------------------------------------------------------------------------------------
    testHeader = getHeader(testFilename)
    filteredCols = getSimilarColIndex(header, testHeader)
    if numCols != len(filteredCols) + 1:
        print('Column Error')
        return 'Column Error'

    testset = np.genfromtxt(testFilename, delimiter=",", skip_header=1, autostrip=True, missing_values=np.nan, usecols=tuple(filteredCols))
    # Copy so that filling columns below never writes into `testset`.
    XTest = testset[:, numTargets:].copy()
    yTest = testset[:, targetIdx]

    # Number of non-missing values per predictor column.
    haveNotNull = np.sum(np.invert(np.isnan(XTest)), axis=0)
    missing = []
    lowDense = []
    for col, available in enumerate(haveNotNull):
        if available == 0:
            # Column is entirely missing: fall back to the training mean.
            # Presumably needed because the imputer below cannot derive a
            # median for (and may drop) an all-NaN column -- confirm.
            XTest[:, col] = Xscaler.mean_[col]
            missing.append(col)
        if available < 0.4 * XTest.shape[0]:
            lowDense.append(col)
    print('Low dense = %d' % (len(lowDense)))

    imp = Imputer(missing_values='NaN', strategy='median', axis=0)
    XTest = imp.fit_transform(XTest)

    keepRows = np.invert(np.isnan(yTest))
    XTest = XTest[keepRows, :]
    yTest = yTest[keepRows]

    # Keep the scaled copy (the original discarded it), so that the test
    # features live on the same scale as the training features.
    XTest = Xscaler.transform(XTest)
    XTestReady = XTest[:, tuple(XIndex)]

    # predicted = estimator.predict(XTestReady)
    predicted = bootstrappingPrediction(estimator, Xready, y, XTestReady, B = 200, parametric = False)
    # Flatten both sides: an (n, 1) target minus an (n,) prediction would
    # silently broadcast to an (n, n) matrix and corrupt the error estimate.
    # NOTE(review): assumes bootstrappingPrediction returns one value per
    # test row -- confirm against its definition.
    absolute_error = np.absolute(np.ravel(yTest) - np.ravel(predicted))

    score_mean = absolute_error.mean()
    score_sd = absolute_error.std()

    # How many of the selected features were fully / mostly missing in the test file.
    missingCount = len(set(missing) & set(XIndex))
    lowDenseCount = len(set(lowDense) & set(XIndex))
    print('%s = rankingMethod %d, algo %s, numFeatures %d, score(mean, sd) = (%f,%f), missing = %d, lowDense = %d' % (targetHeader[targetIdx][0:4], rankingMethod, algorithm, numFeatures, score_mean, score_sd, missingCount, lowDenseCount))
Example #4
0
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn import preprocessing
import matplotlib.pyplot as plt

# Load the (shuffled, stratified) training data.
data = dI.dataImporter(shuffle=True, stratify=True)
X_train, y_train = data.getTrainData()
#X_train = X_train[:,:-3]
# define 10-fold cross validation test harness
#kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=3)
#multiNB = make_pipeline(MultinomialNB())

#multinomial needs correct probability!!!

# Three Naive Bayes variants. The multinomial model is evaluated without
# standardisation and without the last three feature columns (see below);
# the Bernoulli and Gaussian models are evaluated on standardised features.
clf = make_pipeline(MultinomialNB())
clf2 = make_pipeline(preprocessing.StandardScaler(), BernoulliNB())
clf3 = make_pipeline(preprocessing.StandardScaler(), GaussianNB())
scores = cross_val_score(clf, X_train[:, :-3], y_train, cv=10)
scores2 = cross_val_score(clf2, X_train, y_train, cv=10)
# Evaluate the Gaussian pipeline here; the original re-scored clf2, so the
# "Gauss" line below merely repeated the Bernoulli result.
scores3 = cross_val_score(clf3, X_train, y_train, cv=10)

# NOTE(review): no `scoring` argument is passed to cross_val_score, so these
# values come from the estimators' default scorer rather than an explicitly
# requested ROC AUC -- confirm that the "ROC_AUC" label is what is intended.
print("Mean ROC_AUC Multi: %.2f%% (+/- %.2f%%)" %
      (scores.mean() * 100, scores.std() * 100))
print("Mean ROC_AUC Bernu: %.2f%% (+/- %.2f%%)" %
      (scores2.mean() * 100, scores2.std() * 100))
print("Mean ROC_AUC Gauss: %.2f%% (+/- %.2f%%)" %
      (scores3.mean() * 100, scores3.std() * 100))

# from sklearn.metrics import roc_curve
# X_test, y_test = data.getTestData()
# probas_ = clf.fit(X_train, y_train).predict_proba(X_test)
Example #5
0
#[ 5.1  3.5  1.4  0.2] 0

from sklearn.cross_validation import train_test_split
from sklearn import preprocessing
# Keep only the first two feature columns of the iris data.
X = X_iris[:, :2]
y = y_iris
# Hold out a random 25% of the samples as the test set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=33)
print(X_train.shape, y_train.shape)
#(112, 2) (112,)
# Fit the scaler on the training part only, then apply it to both parts.
scaler = preprocessing.StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

import matplotlib.pyplot as plt
colors = ['red', 'greenyellow', 'blue']
# One scatter series per class label; the label doubles as the colour index.
for i, color in enumerate(colors):
    mask = y_train == i
    xs = X_train[mask, 0]
    ys = X_train[mask, 1]
    plt.scatter(xs, ys, c=color)
plt.legend(iris.target_names)
plt.xlabel('Sepal length')
plt.ylabel('Sepal width')

#found a typo here... incorrect from book followed by corrected code
Example #6
0
def sort_features(features, outdir, name, header=None, verbose=0):
    """ Sort a feature array using a Ward hierarchical clustering analysis on
    the rows and the columns, and save the result as a PNG figure.

    The features are standardized first. The figure shows the row dendrogram
    on the left, the column dendrogram on top and the reordered matrix in the
    center, with a colorbar on the right.

    Parameters
    ----------
    features: array (N, M)
        an array of features to be sorted.
    outdir: str
        the destination folder where the outputs will be saved.
    name: str
        the name of the plot (file name without the '.png' extension).
    header: list (M, ), default None
        the features names; when given they are used as x tick labels.
    verbose: int, default 0
        the verbosity level.

    Returns
    -------
    features_snap: str
        the path to the saved image of the sorted features.
    """
    # Check inputs
    if verbose > 0:
        print("[info] Sorting features in array of shape "
              "'{0}'...".format(features.shape))

    # Normalize features
    scaled_features = preprocessing.StandardScaler().fit_transform(features)

    # Newer matplotlib releases ship the seaborn style sheets under a
    # "seaborn-v0_8-" prefix; fall back to that name when the historical one
    # is not available.
    style_name = "seaborn-deep"
    if style_name not in plt.style.available:
        style_name = "seaborn-v0_8-deep"

    # Use the seaborn template to create a pretty display
    with plt.style.context(style_name):

        # Create a figure
        fig = plt.figure(figsize=(12, 12))
        try:
            ax1 = fig.add_axes([0.17, 0.15, 0.1, 0.73])
            ax2 = fig.add_axes([0.3, 0.89, 0.6, 0.1])
            ax2.set_xticks([])
            ax2.set_yticks([])
            for axis in (ax1.xaxis, ax1.yaxis, ax2.xaxis, ax2.yaxis):
                axis.set_visible(False)

            # Ward hierarchical clustering analysis on the rows and the
            # columns.
            linkage_array_row = scipy.cluster.hierarchy.linkage(
                scaled_features, method="ward", metric="euclidean")
            dendogram_1 = scipy.cluster.hierarchy.dendrogram(
                linkage_array_row, orientation="left", ax=ax1)
            linkage_array_col = scipy.cluster.hierarchy.linkage(
                scaled_features.transpose(), method="ward",
                metric="euclidean")
            dendogram_2 = scipy.cluster.hierarchy.dendrogram(
                linkage_array_col, ax=ax2)

            # Organize the input feature array: reorder columns, then rows,
            # following the leaf order of the two dendrograms.
            idx1 = dendogram_1["leaves"]
            idx2 = dendogram_2["leaves"]
            axmatrix = fig.add_axes([0.3, 0.15, 0.6, 0.73])
            matrix = scaled_features[:, idx2]
            matrix = matrix[idx1, :]

            # Render the organized feature matrix. The colormap is passed by
            # name, which avoids the `plt.cm.get_cmap` accessor that recent
            # matplotlib releases no longer provide.
            im = axmatrix.matshow(matrix,
                                  aspect="auto",
                                  origin="lower",
                                  cmap="Spectral",
                                  vmin=-1,
                                  vmax=1)
            if header is not None:
                clusterized_labels = [header[i] for i in idx2]
                axmatrix.xaxis.set_visible(True)
                axmatrix.xaxis.set_label_position("bottom")
                axmatrix.xaxis.tick_bottom()
                axmatrix.set_xticks(range(len(header)))
                axmatrix.set_xticklabels(clusterized_labels,
                                         fontsize=8,
                                         rotation=-90)
            else:
                axmatrix.xaxis.set_visible(False)
            axmatrix.yaxis.set_visible(False)
            axcolor = fig.add_axes([0.91, 0.15, 0.02, 0.73])
            plt.colorbar(im, cax=axcolor)
            # NOTE(review): plt.title targets the current axes, which is
            # presumably the colorbar axes created just above -- confirm the
            # title placement is intended.
            plt.title("Organized features", fontsize=10)

            # Save the plot
            features_snap = os.path.join(outdir, name + ".png")
            fig.savefig(features_snap, format="png")
        finally:
            # Release the figure; pyplot otherwise keeps every figure alive,
            # which leaks memory when this function is called repeatedly.
            plt.close(fig)

    return features_snap
Example #7
0
            pca = PCA(n_components=i + 1, svd_solver='auto')
            pca_data = pca.fit_transform(data)
            return pca, pca_data


if __name__ == '__main__':
    abbrTrain = 'E:\python_project\happinessPredict\DataSet\happiness_train_abbr.csv'
    abbrTest = 'E:\python_project\happinessPredict\DataSet\happiness_test_abbr.csv'
    # trainData, happiness = readData.readData(abbrTrain, True)
    # # Apply z-score standardization to each feature (same column position) across all samples
    # trainData = preprocessing.scale(trainData)
    # print(trainData.mean(axis=0))
    # print(trainData.std(axis=0))

    trainData, happiness = readData.readData(abbrTrain, True)
    scaler = preprocessing.StandardScaler().fit(trainData)
    trainData = scaler.transform(trainData)

    pca, pca_data = get_x_ratio(trainData, 0.95)

    testData = readData.readData(abbrTest, False)
    testData = scaler.transform(testData)

    # Plot a histogram (bar chart) of the principal components
    ratio = pca.explained_variance_ratio_.tolist()
    plt.figure()
    plt.grid()
    plt.bar(range(len(ratio)),
            ratio,
            alpha=0.9,
            facecolor="lightskyblue",
# Build a model that predicts the quality of wine.
# Separate the target column (quality) from the feature columns.
y = data.quality
x = data.drop('quality', axis=1)

# Split into training and test data with 20% held out as test data,
# stratified on the target.
x_train, x_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=123,
                                                    stratify=y)

# Chain a StandardScaler and the model in one pipeline, so that the scaler is
# fitted together with the model on whatever data the pipeline is fitted on.
pipeline = make_pipeline(preprocessing.StandardScaler(),
                         RandomForestRegressor(n_estimators=100))
# Hyperparameter grid. `1.0` (use all features) replaces the former 'auto'
# option: for a random forest regressor the two are equivalent, and recent
# scikit-learn releases no longer accept 'auto'.
hyperparameters = {
    'randomforestregressor__max_features': [1.0, 'sqrt', 'log2'],
    'randomforestregressor__max_depth': [None, 5, 3, 1]
}
# Grid search with 10-fold cross validation over the grid above.
clf = GridSearchCV(pipeline, hyperparameters,
                   cv=10)  #1 hold out and 9 train, iterated over 10 times
# Run the search on the training data.
clf.fit(x_train, y_train)
#clf.best_params_  #displays the optimal chosen parameters

# Use the fitted search object to predict the test data.
y_pred = clf.predict(x_test)
def run_models(settings=None):
    """Train and score a battery of classifiers on a synthetic pair task.

    For each of ``settings['number_iterations']`` subsets, 100000 random index
    pairs (A, B) are drawn. A pair is positive when
    ``y_total[A] - y_total[B] == 1``; the negatives are truncated to the
    number of positives. The features of a pair are the two corresponding
    rows of ``X_total`` placed side by side. The pairs are split 80/20 into
    train/test, and for every size in ``settings['number_of_training']`` the
    first rows of the training part are split 80/20 again into
    train/validation. Every model whose flag in ``settings`` is truthy is
    then fitted and scored on both test and training data.

    This is Python 2 code (print statements, ``xrange``, integer ``/``).
    It relies on names defined outside this function: ``X_total``,
    ``y_total``, ``current_date``, ``performance_score``, ``cal_epochs``,
    ``trainSda``, ``train_a_MultipleAEs`` and ``saveAsCsv``.

    Args:
        settings: mapping of options. Keys read here: ``with_auc_score``,
            ``number_iterations``, ``number_of_training``, ``finetune_lr``,
            ``pretrain_lr``, ``batch_size``, ``pretraining_interations``,
            ``training_interations``, ``hidden_layers_sizes``,
            ``corruption_levels`` and the model flags ``SVM``, ``SVM_RBF``,
            ``SVM_POLY``, ``Log``, ``DL``, ``SAE_SVM``, ``SAE_Log``,
            ``SAE_SVM_RBF``, ``SAE_SVM_POLY``, ``DL_S``, ``SAE_S_SVM``,
            ``SAE_S_SVM_RBF``, ``SAE_S_SVM_POLY``.

    Returns:
        Tuple ``(sda, a_MAE_original, a_MAE_A, a_MAE_B, analysis_scr)`` where
        ``analysis_scr`` is a list of tuples
        ``(subset_no, number_of_training, model_name, isTest) + scores``.

    NOTE(review): the default ``settings=None`` cannot work; the first
    subscript below raises TypeError. ``sda`` is only bound when
    ``settings['DL']`` is truthy and ``predicted_test_y`` only when at least
    one classifier ran, so the final ``saveAsCsv``/``return`` raise NameError
    otherwise.
    """
    analysis_scr = []
    with_auc_score = settings['with_auc_score']

    for subset_no in xrange(1, settings['number_iterations'] + 1):
        # NOTE: under Python 2 this prints a tuple, not two separate values.
        print("Subset:", subset_no)

        ################## generate data ###################
        # Draw 100000 random index pairs; random_integers includes both end
        # points, so indices fall in [0, 59999].
        array_A = []
        array_B = []
        for i in range(100000):
            array_A.append(np.random.random_integers(0, 59999))
            array_B.append(np.random.random_integers(0, 59999))
        # A pair is positive when label(A) - label(B) is exactly 1.
        pos_index = []
        neg_index = []
        for index in xrange(100000):
            if y_total[array_A[index]] - y_total[array_B[index]] == 1:
                pos_index.append(index)
            else:
                neg_index.append(index)
        print 'number of positive examples', len(pos_index)
        # Balance the classes: keep only as many negatives as positives.
        selected_neg_index = neg_index[:len(pos_index)]

        array_A = np.array(array_A)
        array_B = np.array(array_B)
        index_for_positive_image_A = array_A[pos_index]
        index_for_positive_image_B = array_B[pos_index]
        index_for_neg_image_A = array_A[selected_neg_index]
        index_for_neg_image_B = array_B[selected_neg_index]

        # Features of a pair = row of A and row of B side by side.
        X_pos_A = X_total[index_for_positive_image_A]
        X_pos_B = X_total[index_for_positive_image_B]
        X_pos_whole = np.hstack((X_pos_A, X_pos_B))
        X_neg_A = X_total[index_for_neg_image_A]
        X_neg_B = X_total[index_for_neg_image_B]
        X_neg_whole = np.hstack((X_neg_A, X_neg_B))
        print X_pos_A.shape, X_pos_B.shape, X_pos_whole.shape
        print X_neg_A.shape, X_neg_B.shape, X_neg_whole.shape

        # Positives first, then negatives; labels are 1 and 0 respectively.
        X_whole = np.vstack((X_pos_whole, X_neg_whole))
        print X_whole.shape
        y_pos = np.ones(X_pos_whole.shape[0])
        y_neg = np.zeros(X_neg_whole.shape[0])
        y_whole = np.concatenate([y_pos, y_neg])
        print y_whole.shape

        # 80/20 split into (train + validation) and test.
        x_train_pre_validation, x_test, y_train_pre_validation, y_test = train_test_split(
            X_whole, y_whole, test_size=0.2, random_state=211)
        for number_of_training in settings['number_of_training']:

            # Use only the first number_of_training rows, split 80/20 into
            # training and validation.
            x_train, x_validation, y_train, y_validation = train_test_split(x_train_pre_validation[:number_of_training],
                                                                                                        y_train_pre_validation[:number_of_training],\
                                                                        test_size=0.2, random_state=21)
            '''
            x_train, x_validation, y_train, y_validation = train_test_split(x_train_pre_validation[:],
                                                                                                        y_train_pre_validation[:],\
                                                                        test_size=0.4, random_state=21)
            '''
            print x_train.shape, y_train.shape, x_validation.shape, \
            y_validation.shape, x_test.shape, y_test.shape
            # The *_minmax / *_reduced names are plain aliases of the splits;
            # no min-max scaling or reduction is applied here.
            x_train_minmax, x_validation_minmax, x_test_minmax = x_train, x_validation, x_test
            train_X_reduced = x_train
            train_y_reduced = y_train
            test_X = x_test
            test_y = y_test
            ###original data###
            ################ end of data ####################
            # Standardise with statistics learned from the training rows only.
            standard_scaler = preprocessing.StandardScaler().fit(
                train_X_reduced)
            scaled_train_X = standard_scaler.transform(train_X_reduced)
            scaled_test_X = standard_scaler.transform(test_X)
            # Each model section below records two rows in analysis_scr:
            # test scores (isTest=True) and training scores (isTest=False).
            if settings['SVM']:
                print "SVM"
                Linear_SVC = LinearSVC(C=1, penalty="l2")
                Linear_SVC.fit(scaled_train_X, y_train)
                predicted_test_y = Linear_SVC.predict(scaled_test_X)
                isTest = True
                #new
                analysis_scr.append(
                    (subset_no, number_of_training, 'SVM', isTest) + tuple(
                        performance_score(test_y,
                                          predicted_test_y).values()))  #new

                predicted_train_y = Linear_SVC.predict(scaled_train_X)
                isTest = False
                #new
                analysis_scr.append(
                    (subset_no, number_of_training, 'SVM', isTest) + tuple(
                        performance_score(train_y_reduced,
                                          predicted_train_y).values()))

            if settings['SVM_RBF']:
                print "SVM_RBF"
                L1_SVC_RBF_Selector = SVC(C=1, gamma=0.01, kernel='rbf').fit(
                    scaled_train_X, y_train)
                predicted_test_y = L1_SVC_RBF_Selector.predict(scaled_test_X)
                isTest = True
                #new
                analysis_scr.append(
                    (subset_no, number_of_training, 'SVM_RBF', isTest) + tuple(
                        performance_score(test_y,
                                          predicted_test_y).values()))  #new
                predicted_train_y = L1_SVC_RBF_Selector.predict(scaled_train_X)
                isTest = False
                #new
                analysis_scr.append(
                    (subset_no, number_of_training, 'SVM_RBF', isTest) + tuple(
                        performance_score(train_y_reduced,
                                          predicted_train_y).values()))

            if settings['SVM_POLY']:
                print "SVM_POLY"
                L1_SVC_POLY_Selector = SVC(C=1, kernel='poly').fit(
                    scaled_train_X, train_y_reduced)

                predicted_test_y = L1_SVC_POLY_Selector.predict(scaled_test_X)
                isTest = True
                #new
                analysis_scr.append(
                    (subset_no, number_of_training, 'SVM_POLY', isTest) +
                    tuple(
                        performance_score(test_y,
                                          predicted_test_y).values()))  #new

                predicted_train_y = L1_SVC_POLY_Selector.predict(
                    scaled_train_X)
                isTest = False
                #new
                analysis_scr.append((
                    subset_no, number_of_training, 'SVM_POLY', isTest) + tuple(
                        performance_score(train_y_reduced,
                                          predicted_train_y).values()))

            if settings['Log']:
                print "Log"
                log_clf_l2 = sklearn.linear_model.LogisticRegression(
                    C=1, penalty='l2')
                log_clf_l2.fit(scaled_train_X, train_y_reduced)
                predicted_test_y = log_clf_l2.predict(scaled_test_X)
                isTest = True
                #new
                analysis_scr.append(
                    (subset_no, number_of_training, 'Log', isTest) + tuple(
                        performance_score(test_y,
                                          predicted_test_y).values()))  #new
                predicted_train_y = log_clf_l2.predict(scaled_train_X)
                isTest = False
                #new
                analysis_scr.append(
                    (subset_no, number_of_training, 'Log', isTest) + tuple(
                        performance_score(train_y_reduced,
                                          predicted_train_y).values()))

            # direct deep learning

            # Hyper-parameters shared by all the network-based models below.
            # The settings keys are spelled 'interations' (sic).
            finetune_lr = settings['finetune_lr']
            batch_size = settings['batch_size']
            pretraining_epochs = cal_epochs(
                settings['pretraining_interations'],
                x_train_minmax,
                batch_size=batch_size)
            #pretrain_lr=0.001
            pretrain_lr = settings['pretrain_lr']
            training_epochs = cal_epochs(settings['training_interations'],
                                         x_train_minmax,
                                         batch_size=batch_size)
            hidden_layers_sizes = settings['hidden_layers_sizes']
            corruption_levels = settings['corruption_levels']

            if settings['DL']:
                print "direct deep learning"
                # Trained on the unscaled splits.
                sda = trainSda(x_train_minmax, y_train,
                             x_validation_minmax, y_validation,
                             x_test_minmax, test_y,
                             hidden_layers_sizes = hidden_layers_sizes, corruption_levels = corruption_levels, batch_size = batch_size , \
                             training_epochs = training_epochs, pretraining_epochs = pretraining_epochs,
                             pretrain_lr = pretrain_lr, finetune_lr=finetune_lr
                 )
                print 'hidden_layers_sizes:', hidden_layers_sizes
                print 'corruption_levels:', corruption_levels
                test_predicted = sda.predict(x_test_minmax)
                isTest = True
                #new
                analysis_scr.append(
                    (subset_no, number_of_training, 'DL', isTest) +
                    tuple(performance_score(y_test, test_predicted).values()))
                training_predicted = sda.predict(x_train_minmax)
                isTest = False
                #new
                analysis_scr.append((
                    subset_no, number_of_training, 'DL', isTest
                ) + tuple(
                    performance_score(y_train, training_predicted).values()))

            ####transformed original data####
            # Train one multi-autoencoder on the whole (A|B) feature vector,
            # then standardise its output with training-set statistics.
            # This always runs, regardless of the SAE_* flags.
            x = train_X_reduced
            a_MAE_original = train_a_MultipleAEs(
                x,
                pretraining_epochs=pretraining_epochs,
                pretrain_lr=pretrain_lr,
                batch_size=batch_size,
                hidden_layers_sizes=hidden_layers_sizes,
                corruption_levels=corruption_levels)
            new_x_train_minmax_A = a_MAE_original.transform(train_X_reduced)
            new_x_test_minmax_A = a_MAE_original.transform(x_test_minmax)
            standard_scaler = preprocessing.StandardScaler().fit(
                new_x_train_minmax_A)
            new_x_train_scaled = standard_scaler.transform(
                new_x_train_minmax_A)
            new_x_test_scaled = standard_scaler.transform(new_x_test_minmax_A)
            # NOTE(review): the two *_combo arrays are not used again in this
            # function.
            new_x_train_combo = np.hstack((scaled_train_X, new_x_train_scaled))
            new_x_test_combo = np.hstack((scaled_test_X, new_x_test_scaled))

            if settings['SAE_SVM']:
                # SAE_SVM
                print 'SAE followed by SVM'

                Linear_SVC = LinearSVC(C=1, penalty="l2")
                Linear_SVC.fit(new_x_train_scaled, train_y_reduced)
                predicted_test_y = Linear_SVC.predict(new_x_test_scaled)
                isTest = True
                #new
                analysis_scr.append(
                    (subset_no, number_of_training, 'SAE_SVM', isTest) + tuple(
                        performance_score(test_y,
                                          predicted_test_y).values()))  #new

                predicted_train_y = Linear_SVC.predict(new_x_train_scaled)
                isTest = False
                #new
                analysis_scr.append(
                    (subset_no, number_of_training, 'SAE_SVM', isTest) + tuple(
                        performance_score(train_y_reduced,
                                          predicted_train_y).values()))
            if settings['SAE_Log']:
                print 'SAE followed by Log'
                log_clf_l2 = sklearn.linear_model.LogisticRegression(
                    C=1, penalty='l2')
                log_clf_l2.fit(new_x_train_scaled, train_y_reduced)
                predicted_test_y = log_clf_l2.predict(new_x_test_scaled)
                isTest = True
                #new
                analysis_scr.append(
                    (subset_no, number_of_training, 'SAE_Log', isTest) + tuple(
                        performance_score(test_y,
                                          predicted_test_y).values()))  #new
                predicted_train_y = log_clf_l2.predict(new_x_train_scaled)
                isTest = False
                #new
                analysis_scr.append(
                    (subset_no, number_of_training, 'SAE_Log', isTest) + tuple(
                        performance_score(train_y_reduced,
                                          predicted_train_y).values()))

            if settings['SAE_SVM_RBF']:
                # SAE_SVM
                print 'SAE followed by SVM RBF'
                L1_SVC_RBF_Selector = SVC(C=1, gamma=0.01, kernel='rbf').fit(
                    new_x_train_scaled, train_y_reduced)

                predicted_test_y = L1_SVC_RBF_Selector.predict(
                    new_x_test_scaled)
                isTest = True
                #new
                analysis_scr.append(
                    (subset_no, number_of_training, 'SAE_SVM_RBF', isTest) +
                    tuple(
                        performance_score(test_y,
                                          predicted_test_y).values()))  #new

                predicted_train_y = L1_SVC_RBF_Selector.predict(
                    new_x_train_scaled)
                isTest = False
                #new
                analysis_scr.append(
                    (subset_no, number_of_training, 'SAE_SVM_RBF', isTest) +
                    tuple(
                        performance_score(train_y_reduced,
                                          predicted_train_y).values()))
            if settings['SAE_SVM_POLY']:
                # SAE_SVM
                print 'SAE followed by SVM POLY'
                # The variable keeps the "RBF" name but holds a poly-kernel SVC.
                L1_SVC_RBF_Selector = SVC(C=1, kernel='poly').fit(
                    new_x_train_scaled, train_y_reduced)

                predicted_test_y = L1_SVC_RBF_Selector.predict(
                    new_x_test_scaled)
                isTest = True
                #new
                analysis_scr.append(
                    (subset_no, number_of_training, 'SAE_SVM_POLY', isTest) +
                    tuple(
                        performance_score(test_y,
                                          predicted_test_y).values()))  #new

                predicted_train_y = L1_SVC_RBF_Selector.predict(
                    new_x_train_scaled)
                isTest = False
                #new
                analysis_scr.append(
                    (subset_no, number_of_training, 'SAE_SVM_POLY', isTest) +
                    tuple(
                        performance_score(train_y_reduced,
                                          predicted_train_y).values()))

            #### separated transformed data ####
            # Split network: one autoencoder per half of the feature vector
            # (columns of A, columns of B), each with half-sized hidden layers.
            # The "/ 2" expressions rely on Python 2 integer division.
            y_test = test_y
            print 'deep learning using split network'
            # get the new representation for A set. first 784-D
            pretraining_epochs = cal_epochs(
                settings['pretraining_interations'],
                x_train_minmax,
                batch_size=batch_size)

            x = x_train_minmax[:, :x_train_minmax.shape[1] / 2]
            print "original shape for A", x.shape
            # NOTE: in Python 2 the comprehension variable ``x`` leaks and
            # rebinds the outer ``x``; the positional argument is evaluated
            # first and ``x`` is reassigned after the call, so it is harmless.
            a_MAE_A = train_a_MultipleAEs(
                x,
                pretraining_epochs=pretraining_epochs,
                pretrain_lr=pretrain_lr,
                batch_size=batch_size,
                hidden_layers_sizes=[x / 2 for x in hidden_layers_sizes],
                corruption_levels=corruption_levels)
            new_x_train_minmax_A = a_MAE_A.transform(
                x_train_minmax[:, :x_train_minmax.shape[1] / 2])
            x = x_train_minmax[:, x_train_minmax.shape[1] / 2:]

            print "original shape for B", x.shape
            a_MAE_B = train_a_MultipleAEs(
                x,
                pretraining_epochs=pretraining_epochs,
                pretrain_lr=pretrain_lr,
                batch_size=batch_size,
                hidden_layers_sizes=[x / 2 for x in hidden_layers_sizes],
                corruption_levels=corruption_levels)
            new_x_train_minmax_B = a_MAE_B.transform(
                x_train_minmax[:, x_train_minmax.shape[1] / 2:])

            # Encode test and validation halves with the matching autoencoder
            # and glue the two encodings back together.
            new_x_test_minmax_A = a_MAE_A.transform(
                x_test_minmax[:, :x_test_minmax.shape[1] / 2])
            new_x_test_minmax_B = a_MAE_B.transform(
                x_test_minmax[:, x_test_minmax.shape[1] / 2:])
            new_x_validation_minmax_A = a_MAE_A.transform(
                x_validation_minmax[:, :x_validation_minmax.shape[1] / 2])
            new_x_validation_minmax_B = a_MAE_B.transform(
                x_validation_minmax[:, x_validation_minmax.shape[1] / 2:])
            new_x_train_minmax_whole = np.hstack(
                (new_x_train_minmax_A, new_x_train_minmax_B))
            new_x_test_minmax_whole = np.hstack(
                (new_x_test_minmax_A, new_x_test_minmax_B))
            new_x_validationt_minmax_whole = np.hstack(
                (new_x_validation_minmax_A, new_x_validation_minmax_B))
            # Standardised copies are used by the SAE_S_* classifiers; DL_S
            # below is fed the unscaled encodings.
            standard_scaler = preprocessing.StandardScaler().fit(
                new_x_train_minmax_whole)
            new_x_train_minmax_whole_scaled = standard_scaler.transform(
                new_x_train_minmax_whole)
            new_x_test_minmax_whole_scaled = standard_scaler.transform(
                new_x_test_minmax_whole)
            # From here on performance_score is also passed with_auc_score.
            if settings['DL_S']:
                # deep learning using split network
                sda_transformed = trainSda(new_x_train_minmax_whole, y_train,
                     new_x_validationt_minmax_whole, y_validation ,
                     new_x_test_minmax_whole, y_test,
                     hidden_layers_sizes = hidden_layers_sizes, corruption_levels = corruption_levels, batch_size = batch_size , \
                     training_epochs = training_epochs, pretraining_epochs = pretraining_epochs,
                     pretrain_lr = pretrain_lr, finetune_lr=finetune_lr
                     )
                print 'hidden_layers_sizes:', hidden_layers_sizes
                print 'corruption_levels:', corruption_levels

                predicted_test_y = sda_transformed.predict(
                    new_x_test_minmax_whole)
                y_test = test_y
                isTest = True
                #new
                analysis_scr.append(
                    (subset_no, number_of_training, 'DL_S', isTest) + tuple(
                        performance_score(y_test, predicted_test_y,
                                          with_auc_score).values()))

                training_predicted = sda_transformed.predict(
                    new_x_train_minmax_whole)
                isTest = False
                #new
                analysis_scr.append(
                    (subset_no, number_of_training, 'DL_S', isTest) + tuple(
                        performance_score(y_train, training_predicted,
                                          with_auc_score).values()))
            if settings['SAE_S_SVM']:
                print 'SAE_S followed by SVM'

                Linear_SVC = LinearSVC(C=1, penalty="l2")
                Linear_SVC.fit(new_x_train_minmax_whole_scaled,
                               train_y_reduced)
                predicted_test_y = Linear_SVC.predict(
                    new_x_test_minmax_whole_scaled)
                isTest = True
                #new
                analysis_scr.append(
                    (subset_no, number_of_training, 'SAE_S_SVM', isTest) +
                    tuple(
                        performance_score(test_y, predicted_test_y,
                                          with_auc_score).values()))  #new

                predicted_train_y = Linear_SVC.predict(
                    new_x_train_minmax_whole_scaled)
                isTest = False
                #new
                analysis_scr.append(
                    (subset_no, number_of_training, 'SAE_S_SVM', isTest) +
                    tuple(
                        performance_score(train_y_reduced, predicted_train_y,
                                          with_auc_score).values()))
            if settings['SAE_S_SVM_RBF']:
                print 'SAE S followed by SVM RBF'
                L1_SVC_RBF_Selector = SVC(C=1, gamma=0.01, kernel='rbf').fit(
                    new_x_train_minmax_whole_scaled, train_y_reduced)

                predicted_test_y = L1_SVC_RBF_Selector.predict(
                    new_x_test_minmax_whole_scaled)
                isTest = True
                #new
                analysis_scr.append(
                    (subset_no, number_of_training, 'SAE_S_SVM_RBF', isTest) +
                    tuple(
                        performance_score(test_y, predicted_test_y,
                                          with_auc_score).values()))  #new

                predicted_train_y = L1_SVC_RBF_Selector.predict(
                    new_x_train_minmax_whole_scaled)
                isTest = False
                #new
                analysis_scr.append(
                    (subset_no, number_of_training, 'SAE_S_SVM_RBF', isTest) +
                    tuple(
                        performance_score(train_y_reduced, predicted_train_y,
                                          with_auc_score).values()))
            if settings['SAE_S_SVM_POLY']:
                # SAE_SVM
                print 'SAE S followed by SVM POLY'
                # The variable keeps the "RBF" name but holds a poly-kernel SVC.
                L1_SVC_RBF_Selector = SVC(C=1, kernel='poly').fit(
                    new_x_train_minmax_whole_scaled, train_y_reduced)

                predicted_test_y = L1_SVC_RBF_Selector.predict(
                    new_x_test_minmax_whole_scaled)
                isTest = True
                #new
                analysis_scr.append(
                    (subset_no, number_of_training, 'SAE_S_SVM_POLY', isTest) +
                    tuple(
                        performance_score(test_y, predicted_test_y,
                                          with_auc_score).values()))  #new

                predicted_train_y = L1_SVC_RBF_Selector.predict(
                    new_x_train_minmax_whole_scaled)
                isTest = False
                #new
                analysis_scr.append(
                    (subset_no, number_of_training, 'SAE_S_SVM_POLY', isTest) +
                    tuple(
                        performance_score(train_y_reduced, predicted_train_y,
                                          with_auc_score).values()))

        # Rebuilt once per subset; only the value from the last subset is
        # used. '+' binds tighter than the method call's argument, so
        # '_size_' is the separator between layer sizes, not a prefix.
        report_name = 'DL_handwritten_digits' + '_size_'.join(map(str, hidden_layers_sizes)) + \
                        '_' + str(pretrain_lr) + '_' + str(finetune_lr) + '_' + \
                '_' + str(settings['pretraining_interations']) + '_' + current_date
    # Runs once, after all subsets: the summary score passed here comes from
    # whichever classifier assigned test_y / predicted_test_y last.
    saveAsCsv(with_auc_score, report_name,
              performance_score(test_y, predicted_test_y, with_auc_score),
              analysis_scr)
    return sda, a_MAE_original, a_MAE_A, a_MAE_B, analysis_scr
Example #10
0
 def __init__(self, X, Y):
     """Fit a StandardScaler on ``Y`` and keep it as ``self.scaler``.

     ``X`` is accepted but not used by this constructor.
     """
     fitted = preprocessing.StandardScaler()
     fitted.fit(Y)
     self.scaler = fitted
Example #11
0
 def __init__(self, X, Y):
     """Fit one StandardScaler per argument and keep both on the instance.

     ``self.input_scaler`` is fitted on ``X`` and ``self.output_scaler``
     on ``Y``.
     """
     in_scaler = preprocessing.StandardScaler()
     in_scaler.fit(X)
     self.input_scaler = in_scaler

     out_scaler = preprocessing.StandardScaler()
     out_scaler.fit(Y)
     self.output_scaler = out_scaler
Example #12
0
                  barmode="group",
                  template="plotly_white",
                  labels={
                      "var": "Variable",
                      "value": "Value",
                      "variable": "Statistic"
                  },
                  color_discrete_sequence=px.colors.qualitative.Safe,
                  log_y=True)
st.write(feat_fig)
st.write(
    "Wow, that's quite a significant discrepancy - let's scale these to a mean of zero and a standard deviation of 1"
)

# Standardise the continuous features.
scaler = preprocessing.StandardScaler()
scaler.fit(cont_df)
X = scaler.transform(cont_df)
# Summarise the scaled columns (one row per column) to show that the mean is
# now 0 and the standard deviation is 1.
feat_desc = pd.DataFrame(X).describe().T.reset_index()
feat_desc = feat_desc.rename(columns={'index': "var"})
feat_fig = px.bar(feat_desc[['var', 'mean', 'std']].melt(id_vars=['var']),
                  x="var",
                  y="value",
                  color="variable",
                  barmode="group",
                  template="plotly_white",
                  labels={
                      "var": "Variable",
                      "value": "Value",
                      "variable": "Statistic"
                  },
Example #13
0
    zdm = random.choice(ls_z_dim)
    lre = random.choice(ls_lr)
    epch = random.choice(ls_epoch)

    # One iteration per (train, test) index pair produced by skf.split.
    for train_index, test_index in skf.split(GDSCE.values, Y):
        k = k + 1
        # Slice the three input tables (E, M, C) with the same row indices so
        # they stay aligned with each other and with the labels.
        X_trainE = GDSCE.values[train_index, :]
        X_testE = GDSCE.values[test_index, :]
        X_trainM = GDSCM.values[train_index, :]
        X_testM = GDSCM.values[test_index, :]
        X_trainC = GDSCC.values[train_index, :]
        # Fixed: the C test rows were taken from GDSCM (copy-paste slip), so
        # the C test input did not come from the table its training rows did.
        X_testC = GDSCC.values[test_index, :]
        y_trainE = Y[train_index]
        y_testE = Y[test_index]

        # Standardise the E table with statistics from the training rows only.
        scalerGDSC = sk.StandardScaler()
        scalerGDSC.fit(X_trainE)
        X_trainE = scalerGDSC.transform(X_trainE)
        X_testE = scalerGDSC.transform(X_testE)

        # The M and C tables are not scaled; only their NaNs are replaced.
        X_trainM = np.nan_to_num(X_trainM)
        X_trainC = np.nan_to_num(X_trainC)
        X_testM = np.nan_to_num(X_testM)
        X_testC = np.nan_to_num(X_testC)

        # Test tensors for this fold.
        TX_testE = torch.FloatTensor(X_testE)
        TX_testM = torch.FloatTensor(X_testM)
        TX_testC = torch.FloatTensor(X_testC)
        ty_testE = torch.FloatTensor(y_testE.astype(int))

        #Train
import joblib
import pandas as pd
import numpy as np
dat = pd.read_csv("trial.csv")
dat.head(3)
# Missing value check: number of NaNs per column
count_nan = dat.isnull().sum()
count_nan
# Fill missing Money_Value entries with the column mean. The result is
# assigned back to the column: calling fillna(..., inplace=True) on
# dat['Money_Value'] acts on an intermediate object and does not update
# `dat` under pandas Copy-on-Write.
dat['Money_Value'] = dat['Money_Value'].fillna(dat['Money_Value'].mean())
dat.info()
# Divide X and Y: the first nine columns are features, the rest are targets
X, Y = dat.iloc[0:, 0:9], dat.iloc[0:, 9:]
# Standardization (z-score) of the features
from sklearn import preprocessing
zscore = preprocessing.StandardScaler()
X = zscore.fit_transform(X)
# Spearman correlation coefficients for feature selection
dfs = dat.corr('spearman')  # pairwise Spearman correlation of all columns
print(dfs)

# Rank the columns by their signed correlation with Risk
dfs["Ranking"] = dfs["Risk"].rank(method="first")
display(dfs)  # Print all Spearman coefficient values

# Rank again by absolute correlation, so strong negative correlations count
dfs['sort_helper'] = dfs['Risk'].abs()
dfs["absRanking"] = dfs["sort_helper"].rank(method="first")
display(dfs["absRanking"])  # Ascending order

#RFE for feature selection
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
def select_scaler():
    """Return a new, unfitted StandardScaler created with ``with_mean=False``."""
    return preprocessing.StandardScaler(with_mean=False)
Example #16
0
    def read_matdataset(self, opt):
        """Load the NUS-WIDE attributes, features and labels onto the instance.

        Reads the word-embedding attributes (pickle) and three HDF5 feature
        files from the hard-coded ``NUS-WIDE`` directory, optionally scales
        the features, drops training samples that have no positive label,
        keeps a "trimmed" training subset with at most ``opt.N`` labels per
        sample, and generates fake test labels for GZSL.

        Args:
            opt: options object. Attributes read: ``validation``,
                ``preprocessing``, ``standardization``, ``N``, ``syn_num``,
                ``per_seen``, ``per_unseen``, ``per_seen_unseen``.

        NOTE(review): when ``opt.validation`` is truthy no feature/label
        attributes are assigned here, so the code after the ``if`` needs
        ``self.train_label`` etc. to exist already. When ``opt.preprocessing``
        is falsy, ``self.test_seen_unseen_feature`` and
        ``self.test_seen_unseen_label`` are not set. Confirm both are intended.
        """
        tic = time.time()
        src = "NUS-WIDE"  # path containing features
        att_path = os.path.join(
            src, 'word_embedding',
            'NUS_WIDE_pretrained_w2v_glove-wiki-gigaword-300')
        file_tag1k = os.path.join(src, 'NUS_WID_Tags', 'TagList1k.txt')
        file_tag81 = os.path.join(src, 'ConceptsList', 'Concepts81.txt')
        self.seen_cls_idx, _ = get_seen_unseen_classes(file_tag1k, file_tag81)
        # Close the file deterministically instead of leaking the handle.
        # NOTE: pickle.load can execute arbitrary code from the file; only
        # load attribute files from a trusted source.
        with open(att_path, 'rb') as att_file:
            src_att = pickle.load(att_file)
        print("attributes are combined in this order-> seen+unseen")
        # Seen-class rows of src_att[0] first, then all of src_att[1].
        self.attribute = torch.from_numpy(
            normalize(
                np.concatenate((src_att[0][self.seen_cls_idx], src_att[1]),
                               axis=0)))
        # VGG features path
        train_loc = util.load_dict_from_hdf5(
            os.path.join(src, 'nus_wide_paper_features',
                         'nus_seen_train_vgg19.h5'))
        test_unseen_loc = util.load_dict_from_hdf5(
            os.path.join(src, 'nus_wide_paper_features',
                         'nus_zsl_test_vgg19.h5'))
        test_seen_unseen_loc = util.load_dict_from_hdf5(
            os.path.join(src, 'nus_wide_paper_features',
                         'nus_gzsl_test_vgg19.h5'))

        feature_train_loc = train_loc['features']
        label_train_loc = train_loc['labels']
        feature_test_unseen_loc = test_unseen_loc['features']
        label_test_unseen_loc = test_unseen_loc['labels']
        feature_test_seen_unseen_loc = test_seen_unseen_loc['features']
        label_test_seen_unseen_loc = test_seen_unseen_loc['labels']
        print("Data loading finished, Time taken: {}".format(time.time() -
                                                             tic))

        tic = time.time()
        if not opt.validation:
            if opt.preprocessing:
                if opt.standardization:
                    print('standardization...')
                    scaler = preprocessing.StandardScaler()
                else:
                    scaler = preprocessing.MinMaxScaler()

                # Fit the scaler on the training features only and reuse it
                # for both test sets.
                _train_feature = scaler.fit_transform(feature_train_loc)
                _test_unseen_feature = scaler.transform(
                    feature_test_unseen_loc)
                _test_seen_unseen_feature = scaler.transform(
                    feature_test_seen_unseen_loc)

                # All three splits are additionally divided by the maximum of
                # the scaled training features.
                self.train_feature = torch.from_numpy(_train_feature).float()
                mx = self.train_feature.max()
                self.train_feature.mul_(1 / mx)
                self.train_label = torch.from_numpy(label_train_loc).long()

                self.test_unseen_feature = torch.from_numpy(
                    _test_unseen_feature).float()
                self.test_unseen_feature.mul_(1 / mx)
                self.test_unseen_label = torch.from_numpy(
                    label_test_unseen_loc).long()

                self.test_seen_unseen_feature = torch.from_numpy(
                    _test_seen_unseen_feature).float()
                self.test_seen_unseen_feature.mul_(1 / mx)
                self.test_seen_unseen_label = torch.from_numpy(
                    label_test_seen_unseen_loc).long()
            else:
                # No scaling: use the raw features as loaded.
                self.train_feature = torch.from_numpy(
                    feature_train_loc).float()
                self.train_label = torch.from_numpy(label_train_loc).long()
                self.test_unseen_feature = torch.from_numpy(
                    feature_test_unseen_loc).float()
                self.test_unseen_label = torch.from_numpy(
                    label_test_unseen_loc).long()

        print("REMOVING ZEROS LABELS")
        # Clamp labels to {0, 1} and drop samples without any positive label.
        temp_label = torch.clamp(self.train_label, 0, 1)
        temp_seen_labels = temp_label.sum(1)
        temp_label = temp_label[temp_seen_labels > 0]

        self.train_label = temp_label
        self.train_feature = self.train_feature[temp_seen_labels > 0]

        # "Trimmed" subset: samples with at most opt.N positive labels.
        self.train_trimmed_label = self.train_label[temp_label.sum(1) <= opt.N]
        self.train_trimmed_feature = self.train_feature[
            temp_label.sum(1) <= opt.N]

        print("Data with N={} labels={}".format(
            opt.N, self.train_trimmed_label.shape))
        print(
            "Full Data labels={} with min label/feature = {} and max label/feature = {}"
            .format(self.train_label.shape,
                    temp_label.sum(1).min(),
                    temp_label.sum(1).max()))

        # Class ids follow the attribute order: seen classes first, then
        # unseen classes up to len(self.attribute).
        self.seenclasses = torch.from_numpy(
            np.arange(0, self.seen_cls_idx.shape[-1]))  # [0-925]
        self.unseenclasses = torch.from_numpy(
            np.arange(0 + self.seen_cls_idx.shape[-1],
                      len(self.attribute)))  # [925-1006]

        self.N = opt.N
        self.syn_num = opt.syn_num
        self.per_seen = opt.per_seen
        self.per_unseen = opt.per_unseen
        self.per_seen_unseen = opt.per_seen_unseen

        print("USING TRAIN FEATURES WITH <=N")
        self.ntrain = self.train_trimmed_feature.size()[0]
        train_labels = self.train_trimmed_label

        self.ntest_unseen = self.test_unseen_feature.size()[0]
        self.ntrain_class = self.seenclasses.size(0)
        self.ntest_class = self.unseenclasses.size(0)
        self.train_class = self.seenclasses.clone()
        self.allclasses = torch.arange(0, self.ntrain_class +
                                       self.ntest_class).long()
        self.GZSL_fake_test_labels = generate_fake_test_from_train_labels(train_labels, self.attribute, self.seenclasses, \
                                        self.unseenclasses, self.syn_num, self.per_seen, self.per_unseen, self.per_seen_unseen)

        print("Data preprocssing finished, Time taken: {}".format(time.time() -
                                                                  tic))
# In[69]:


train.columns


# In[70]:


# Columns that get standardised.
cols_for_ss = [
    'perc_premium_paid_by_cash_credit',
    'age_in_years',
    'Income',
    'Count_3-6_months_late',
    'Count_6-12_months_late',
    'Count_more_than_12_months_late',
    'application_underwriting_score',
    'no_of_premiums_paid',
    'sourcing_channel',
    'residence_area_type',
]

# Learn mean and standard deviation from the training frame only, then apply
# the same transformation to both frames.
scaler = preprocessing.StandardScaler()
scaler.fit(train[cols_for_ss])
train[cols_for_ss] = scaler.transform(train[cols_for_ss])
test[cols_for_ss] = scaler.transform(test[cols_for_ss])
print(scaler.mean_)


# In[71]:


train.head()


# # Build Training and Testing Model

# In[72]:
# Feature table: five base columns plus one indicator column per education
# level.
Features = pd.concat(
    [
        loan_df[['Principal', 'terms', 'age', 'Gender', 'weekend']],
        pd.get_dummies(loan_df['education']),
    ],
    axis=1,
)
# Percentage of population is too small
Features.drop(columns=['Master or Above'], inplace=True)
Features.head()

# Feature set: X
X = Features
X[0:5]

# Label: y
y = loan_df['loan_status'].values
y[0:5]

# Normalize data - equalize the range and variability of the features, which
# reduces bias from differences in feature scale.
X = preprocessing.StandardScaler().fit(Features).transform(
    Features.astype(float))
X[0:5]
'''************************************'''
''' CLASSIFICATION MODELING   '''
'''*************************************'''
# K Nearest Neighbor (KNN)
# Decision Tree
# Support Vector Machine (SVM)
# Logistic Regression

print(loan_df.dtypes)
'''************************************'''
''' KNN - K-Nearest Neighbors  '''
'''*************************************'''

from sklearn.model_selection import train_test_split
Example #19
0
    p_traj_clean_df['id'] += end_id
    end_id = p_traj_clean_df['id'].values[-1]
    compute_x2(p_traj_clean_df)
    compute_x2(hits, prefix='')
    p_traj_clean_all.append(p_traj_clean_df)

    xyz = hits.loc[:, ['x', 'y', 'z']].values.transpose()
    rtp = cart2spherical(xyz).transpose()
    rtp_df = pd.DataFrame(rtp, columns=('r', 'theta', 'phi'))
    hits = pd.concat((hits, rtp_df), axis=1)

    hits_all.append(hits)
    if i > -1:
        break

scl = preprocessing.StandardScaler()
# Alternative choices for `clf` are left commented out around the active one.
# clf = LinearDiscriminantAnalysis(n_components=None)
clf = LFDA(k=2)
# clf = NCA()
# clf = LMNN(k=2)
# clf = RCA_Supervised()
# Column names read from the hits frame (X_cols) and from the clean
# trajectory frame (tX_cols); the two tuples are used position by position
# with the same scaler below.
X_cols = ('r', 'theta', 'phi', 'x', 'y', 'z', 'x2', 'y2', 'z2')
tX_cols = ('r', 'theta', 'phi', 'tx', 'ty', 'tz', 'tx2', 'ty2', 'tz2')
# X_cols = ('x2', 'y2', 'z2')
# tX_cols = ('tx2', 'ty2', 'tz2')

# Stack the per-iteration frames collected earlier into single frames.
p_traj_clean_cat = pd.concat(p_traj_clean_all, ignore_index=True)
hits_cat = pd.concat(hits_all, ignore_index=True)

# The scaler is fitted on the hits columns and then reused, unchanged, on the
# clean-trajectory columns.
X_scale = scl.fit_transform(hits_cat.loc[:, X_cols].values)
X_clean_scale = scl.transform(p_traj_clean_cat.loc[:, tX_cols].values)
# Split the data with scikit-learn's train_test_split.
# random_state seeds the shuffling so the split can be reproduced;
# stratify=y keeps the distribution of y similar in the train and test sets,
# which makes the evaluation more reliable.
X_train, X_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=123,
                                                    stratify=y)

# Now standardize (subtract the mean and divide by the standard deviation).
# The simple one-shot form would be:
#     X_train_scaled = preprocessing.scale(X_train)
# but that only handles the data it is given and cannot be re-applied to the
# test set. The transformer API instead fits once on the training data and
# can then transform the current and any future data set the same way.

# Fit the transformer on the training set.
scaler = preprocessing.StandardScaler().fit(X_train)
# Apply the transformer to the training set.
X_train_scaled = scaler.transform(X_train)
# print(...) with one argument behaves identically on Python 2 and Python 3;
# the former `print expr` statement form is a SyntaxError on Python 3.
print(X_train_scaled.mean(axis=0))
print(X_train_scaled.std(axis=0))
# Apply the same fitted transformer to the test set.
X_test_scaled = scaler.transform(X_test)
print(X_test_scaled.mean(axis=0))
print(X_test_scaled.std(axis=0))
# Modeling pipeline: standardize, then fit a random forest regressor.
# (An earlier variant used n_estimators=1000.)
pipeline = make_pipeline(preprocessing.StandardScaler(),
                         RandomForestRegressor(n_estimators=100))
#set hyperparameters
hyperparameters = {
    'randomforestregressor__max_features': ['auto', 'sqrt', 'log2'],
    'randomforestregressor__max_depth': [None, 5, 3, 1]
# 1000 samples drawn from a normal distribution (mean 70, std 10), reshaped
# into a single column.
weight = np.random.normal(loc=70, scale=10, size=1000).reshape(-1, 1)
print(height.shape, weight.shape)
height[:5], weight[:5]

# %%
# Each row is one point (x, y) = (height, weight).
original_data = np.concatenate((height, weight), axis=1)
print(original_data.shape)
original_data[:5]

# %%
plot(original_data, 'Original')

# %%
# Scale to zero mean and unit variance.
standard_scaler_data = preprocessing.StandardScaler().fit_transform(
    original_data)
plot(standard_scaler_data, 'StandardScaler')

# %%
# Scale to the range between 0 and 1.
min_max_scaler_data = preprocessing.MinMaxScaler().fit_transform(original_data)
plot(min_max_scaler_data, 'MinMaxScaler')

# %%
# Scale to the range between -1 and 1.
max_abs_scaler_data = preprocessing.MaxAbsScaler().fit_transform(original_data)
plot(max_abs_scaler_data, 'MaxAbsScaler')

# %%
# Scale to between 0 and 1 while keeping the distribution of the original data.
# NOTE(review): Normalizer rescales each sample (row) to unit norm rather than
# scaling each column - confirm that the description above is what is meant.
normalizer_data = preprocessing.Normalizer().fit_transform(original_data)
                 header=None,
                 low_memory=False)
# Throughout this script the features are every column of `df` except the
# last one (df.iloc[:, :-1]) and the target is the last column
# (df.iloc[:, -1:]); every evaluation uses cv=10.
df, _ = prep_data('')
print(df.describe())

print('=== linear regression ===')
regr = linear_model.LinearRegression()
print('r2 = %.2f' % cross_val_score(
    regr, df.iloc[:, :-1], df.iloc[:, -1:], cv=10, scoring='r2').mean())
# 'neg_mean_squared_error' scores are negated MSEs: flip the sign, take the
# square root of each fold's value, then average the per-fold RMSEs.
print('rmse = %.2f' %
      np.sqrt(-1 * cross_val_score(regr,
                                   df.iloc[:, :-1],
                                   df.iloc[:, -1:],
                                   cv=10,
                                   scoring='neg_mean_squared_error')).mean())
# Same linear model again, now preceded by a StandardScaler step inside a
# Pipeline, evaluated with the same two metrics.
regr = Pipeline([('trans', preprocessing.StandardScaler()), ('regr', regr)])
print('r2 = %.2f' % cross_val_score(
    regr, df.iloc[:, :-1], df.iloc[:, -1:], cv=10, scoring='r2').mean())
print('rmse = %.2f' %
      np.sqrt(-1 * cross_val_score(regr,
                                   df.iloc[:, :-1],
                                   df.iloc[:, -1:],
                                   cv=10,
                                   scoring='neg_mean_squared_error')).mean())
# Ridge and Lasso are scored without an explicit `scoring` argument, so
# cross_val_score uses its default scorer here.
print('=== ridge ===')
regr = linear_model.Ridge(alpha=.05)
print(cross_val_score(regr, df.iloc[:, :-1], df.iloc[:, -1:], cv=10).mean())
print('=== lasso ===')
regr = linear_model.Lasso(alpha=.05)
print(cross_val_score(regr, df.iloc[:, :-1], df.iloc[:, -1:], cv=10).mean())
print('=== Poly Linear ===')
    col_ind = np.repeat(np.arange(N).reshape([-1, N]), N, axis=0)
    row_ind = row_ind[mask_tri]
    col_ind = col_ind[mask_tri]

    #%% classifier and learning parameters

    # MLR adapted for recursive feature elimination (RFE)
    class RFE_pipeline(skppl.Pipeline):
        """Pipeline that republishes its last step's ``coef_`` on itself.

        Recursive feature elimination reads the coefficients from the object
        it wraps, so they are copied up from the final element (the
        classifier) after every fit.
        """

        def fit(self, X, y=None, **fit_params):
            """Fit all steps, then copy ``coef_`` from the last step onto the pipeline.

            Returns the pipeline itself.
            """
            super(RFE_pipeline, self).fit(X, y, **fit_params)
            # Each entry of `steps` ends with the estimator object; the last
            # entry is the classifier whose coefficients are exposed.
            final_step = self.steps[-1]
            self.coef_ = final_step[-1].coef_
            return self

    c_MLR = RFE_pipeline([('std_scal', skprp.StandardScaler()),
                          ('clf',
                           skllm.LogisticRegression(C=10,
                                                    penalty='l2',
                                                    multi_class='multinomial',
                                                    solver='lbfgs',
                                                    max_iter=500))])

    # nearest neighbor
    c_1NN = sklnn.KNeighborsClassifier(n_neighbors=1,
                                       algorithm='brute',
                                       metric='correlation')

    # cross-validation scheme
    cv_schem = skms.StratifiedShuffleSplit(n_splits=1, test_size=0.2)
    n_rep = 10  # number of repetitions
# Replace the 'Formation' column by an integer code: a new 'FormationNum'
# column is inserted at position 1 and filled with the index of each distinct
# Formation value, in order of first appearance.
feature_vectors.insert(1, 'FormationNum', 0)

for ii, formation in enumerate(feature_vectors['Formation'].unique()):
    # One .loc call selects rows and column together. The previous chained
    # form (`df.FormationNum[mask] = ii`) assigns through an intermediate
    # Series, which raises SettingWithCopyWarning and does not update the
    # frame at all when pandas copy-on-write is enabled.
    feature_vectors.loc[feature_vectors['Formation'] == formation,
                        'FormationNum'] = ii

feature_vectors = feature_vectors.drop(['Formation'], axis=1)

# ***
# Normalizing and splitting data

# In[3]:

from sklearn import preprocessing
from sklearn.model_selection import train_test_split

# Standardize every column of feature_vectors, then hold out 20% of the rows
# as a test set (fixed random_state so the split is reproducible).
# NOTE(review): the scaler is fitted on all rows before the split, so the
# held-out rows contribute to the scaling statistics - confirm this is
# acceptable, or fit on X_train only.
scaler = preprocessing.StandardScaler().fit(feature_vectors)
scaled_features = scaler.transform(feature_vectors)

X_train, X_test, y_train, y_test = train_test_split(scaled_features,
                                                    correct_facies_labels,
                                                    test_size=0.2,
                                                    random_state=42)

#%% Use tpot
from tpot import TPOTClassifier
from sklearn.pipeline import make_pipeline, make_union
from sklearn.ensemble import ExtraTreesClassifier, VotingClassifier

#tpot = TPOTClassifier(generations=5, population_size=20, verbosity=2,
#                      max_eval_time_mins = 20, max_time_mins=100, scoring='f1_micro')
#tpot.fit(X_train, y_train)
        robot_state = pd.concat([robot_positions, robot_velocitys], axis=1)
    else:
        robot_state_single_cam = pd.concat([robot_positions, robot_velocitys],
                                           axis=1)
        robot_state = pd.concat([robot_state, robot_state_single_cam], axis=0)

# Keep every 4th row of robot_state (rows 0, 4, 8, ...) and rebuild a frame
# from the kept rows.
robot_states_list = robot_state.values.tolist()
list_ = []
# len(robot_state[0]) is the length of the column labelled 0 - presumably the
# number of rows; verify if the column labels ever change.
for i in range(0, len(robot_state[0]), 4):
    list_.append(robot_states_list[i])
robot_states_frame_rate = pd.DataFrame(list_)
print("Done")

################################## Standardization for Robot States ###################################################################
# Remember the column labels: transform() returns a plain array, so the frame
# is rebuilt afterwards with the same labels.
robot_state_names = robot_states_frame_rate.columns
scaler = preprocessing.StandardScaler()
myScaler = scaler.fit(robot_states_frame_rate)
robot_states_frame_rate = myScaler.transform(robot_states_frame_rate)
robot_states_frame_rate = pd.DataFrame(robot_states_frame_rate,
                                       columns=robot_state_names)
print(robot_states_frame_rate.shape)

################################## Load Strawberry Data ##################################################################
strawberry_1 = pd.read_csv('/content/data_set_003/straw_1/data_set_' + str(0) +
                           '_strawberry_data_store_1.csv',
                           delimiter=',',
                           error_bad_lines=False,
                           header=None)
strawberry_2 = pd.read_csv('/content/data_set_003/straw_2/data_set_' + str(0) +
                           '_strawberry_data_store_2.csv',
                           delimiter=',',
Example #26
0
def read_hdf_data_psi(path = 'premix_data', key='of_tables', in_labels=['zeta','f','pv'], labels = ['T'], scaler = None):
    """Read the HDF5 table, add a ``psi`` column and return scaled inputs and labels.

    ``psi`` is computed for every row as ``1 / (R_m * T)`` with
    ``R_m = R_universal * sum(Y_i / M_i)``. ``Y_i`` are the columns named in
    the file ``GRI_species_order`` and ``M_i`` the matching molar weights
    from ``molar_weights.json`` (converted from g to kg). Both files are
    read from the current working directory.

    Parameters
    ----------
    path : str
        HDF5 file handed to ``pandas.read_hdf``.
    key : str
        Key of the table inside the HDF5 file.
    in_labels : list of str
        Columns used as inputs. The default list is never modified.
    labels : list of str
        Columns used as targets. The default list is never modified.
    scaler : {'MinMax', 'Standard'}
        Which scikit-learn scaler is fitted to the inputs and to the labels.
        Any other value - including the default ``None`` - raises
        ``ValueError``.

    Returns
    -------
    input_np : ndarray
        Scaled input columns.
    label_np : ndarray
        Scaled label columns.
    df : DataFrame
        The loaded table including the new ``psi`` column.
    in_scaler, out_scaler
        The fitted input and label scalers.

    Raises
    ------
    ValueError
        If ``scaler`` is neither ``'MinMax'`` nor ``'Standard'``.
    """
    # Fail fast: reject an unsupported scaler before reading any file or
    # computing psi, instead of after all the work has been done.
    if scaler not in ('MinMax', 'Standard'):
        raise ValueError('Only possible scalers are: MinMax or Standard.')

    # read in the hdf5 file
    try:
        df = pd.read_hdf(path, key=key)
    except Exception:
        # Keep the hint for the user, but re-raise: carrying on without `df`
        # only produced an unrelated error further down.
        print('Check the data path and key')
        raise

    # read the molar weights
    with open('molar_weights.json', 'r') as fp:
        molar_weights = json.load(fp)

    # read in the order of the species names
    with open('GRI_species_order') as f:
        all_species = f.read().splitlines()

    # numpy array of species molar weights, in the same order as the columns
    molar_weights_np = np.array([molar_weights[s] for s in all_species])
    molar_weights_np = molar_weights_np / 1000   # conversion from g to kg! This is needed for OpenFOAM

    # `.values` replaces DataFrame.as_matrix(), which was removed in pandas 1.0
    T_vector = df['T'].values
    gri_mass_frac = df[all_species].values

    # COMPUTE THE CORRECT PSI VALUE
    R_universal = 8.314459

    print('Starting to compute psi ... ')
    # One vectorized pass over all rows: row-wise sum of Y_i / M_i.
    R_m = R_universal * (gri_mass_frac / molar_weights_np).sum(axis=1)

    # hand back the data to df
    df['psi'] = 1 / (R_m * T_vector)
    print('Done with psi!\n')

    # `scaler` was validated above, so only these two cases remain.
    if scaler == 'MinMax':
        scaler_cls = preprocessing.MinMaxScaler
    else:
        scaler_cls = preprocessing.StandardScaler
    in_scaler = scaler_cls()
    out_scaler = scaler_cls()

    input_df = df[in_labels]
    input_np = in_scaler.fit_transform(input_df)

    label_df = df[labels]
    label_np = out_scaler.fit_transform(label_df)

    print('\n*******************************')
    print('The scaler is %s\n' % scaler)
    print('This is the order of the labels:')
    for f in labels:
        print(f)
    print('*******************************\n')
    return input_np, label_np, df, in_scaler, out_scaler
Example #27
0
def crossValidationOneOption(filename, targetIdx, rankingMethod, algorithm, numFeatures, cv=5):
	"""
	Evaluate the specified algorithm for predicting a target variable in the next year,
	using cross validation and the mean absolute error.

	This function is used in iteration 2.

	** This function is hard coded. Please be careful while editing. **

	Parameters
	----------
	filename : string
		The name of stacked file in iteration 2.

	targetIdx : int
		The index of target variable in the target list.

	rankingMethod : int
		The number specifying how predictors are ranked. (See the file of ranked predictor lists)

	algorithm : string (usually 3-character)
		The code of prediction algorithm such as 'RID' for ridge regression.

	numFeatures : int
		The number of features included in the model.

	cv : int
		The number of folds for cross validation.

	Return Value
	----------
	None when the evaluation ran; the results (error, sd, time spent) are printed.
	If getEstimator() returns None for `algorithm`, the string
	"<algorithm>: Wrong Algorithm" is returned instead and nothing is printed.

	"""
	# filename = '2006-2013_FilteredColsTargetMissingBlank.csv'
	header = getHeader(filename)
	startTargetIndex, startPredictorIndex, numCols = getTargetAndPredictorIndex(header)
	numTargets = startPredictorIndex - startTargetIndex
	predictorHeader = header[startPredictorIndex:]
	targetHeader = header[startTargetIndex:startPredictorIndex]
	targetIDList = [int(head[0:4]) for head in targetHeader]

	# Only the target and predictor columns are loaded, so inside `dataset`
	# the targets occupy columns [0, numTargets) and the predictors follow.
	dataset = np.genfromtxt(filename, delimiter=",", skip_header=1, autostrip=True, missing_values=np.nan, usecols=tuple(range(startTargetIndex,numCols)))
	X = dataset[:,numTargets:]
	y = dataset[:,targetIdx]

	# Drop the rows whose target value is missing.
	keepRows = np.invert(np.isnan(y))
	X = X[keepRows,:]
	y = y[keepRows]
	y = y.reshape(-1,1)

	# transform() returns a new array; the result used to be discarded, which
	# left X unscaled. Assign it so the predictors really are standardized.
	Xscaler = preprocessing.StandardScaler().fit(X)
	X = Xscaler.transform(X)

	# The target scaler is only fitted and handed to getEstimator(); y itself
	# is deliberately left on its original scale.
	Yscaler = preprocessing.StandardScaler().fit(y)

	estimator = getEstimator(algorithm,Yscaler,numFeatures)
	if estimator is None:
		return algorithm + ': Wrong Algorithm'

	XIndex = getFeaturesIndex(predictorHeader,'Feature Selection 2006-2013',targetIDList[targetIdx],rankingMethod,numFeatures)
	Xready = X[:,tuple(XIndex)]
	y = np.ravel(y)

	startTime = time.time()
	predicted = cross_val_predict(estimator, Xready, y, cv=cv)
	absolute_error = np.absolute(y - predicted)
	score_mean = absolute_error.mean()
	score_sd = absolute_error.std()
	timeProcessed = time.time()-startTime

	# Single-argument print(...) works on both Python 2 and Python 3.
	print('%s = rankingMethod %d, algo %s, numFeatures %d, score(mean, sd) = (%f,%f), time = %f' % (targetHeader[targetIdx], rankingMethod, algorithm, numFeatures, score_mean, score_sd, timeProcessed))
def define_regressors():
    '''
    Build every regressor the data can be trained with.

    New regressors should be added here. An entry may be a plain estimator
    or a pipeline that includes standardisation or any other preprocessing.
    The parameters are hard coded because they are expected to stay more or
    less constant once tuned.
    TODO: Include a feature selection method in the pipeline?
          That way it can be done automatically separately in each energy bin.
          (see https://scikit-learn.org/stable/modules/feature_selection.html).

    Returns
    -------
    A dictionary of regressors to train, keyed by regressor name.
    '''

    def quantile_mlp(hidden_layer_sizes, activation, solver='adam',
                     output_distribution='normal'):
        # All MLP variants share this layout: a quantile transform of the
        # inputs followed by an MLP. Only the layer sizes, activation, solver
        # and transform output distribution vary between them.
        # (early_stopping is intentionally left at its default.)
        return make_pipeline(
            preprocessing.QuantileTransformer(
                output_distribution=output_distribution, random_state=0),
            MLPRegressor(
                hidden_layer_sizes=hidden_layer_sizes,
                solver=solver,
                max_iter=20000,
                activation=activation,
                tol=1e-5,
                random_state=0
            )
        )

    return {
        'random_forest': RandomForestRegressor(
            n_estimators=300, random_state=0, n_jobs=8),
        'MLP': quantile_mlp((80, 45), 'tanh'),
        'MLP_relu': quantile_mlp((100, 50), 'relu'),
        'MLP_logistic': quantile_mlp((80, 45), 'logistic'),
        'MLP_uniform': quantile_mlp(
            (80, 45), 'tanh', output_distribution='uniform'),
        'MLP_small': quantile_mlp((36, 6), 'tanh'),
        'MLP_lbfgs': quantile_mlp((36, 6), 'logistic', solver='lbfgs'),
        'BDT': AdaBoostRegressor(
            DecisionTreeRegressor(max_depth=30, random_state=0),
            n_estimators=1000, random_state=0
        ),
        'linear_regression': LinearRegression(n_jobs=4),
        'ridge': Ridge(alpha=1.0),
        'SVR': SVR(C=10.0, epsilon=0.2),
        'linear_SVR': make_pipeline(
            preprocessing.StandardScaler(),
            LinearSVR(random_state=0, tol=1e-5, C=10.0, epsilon=0.2, max_iter=100000)
        ),
        'SGD': make_pipeline(
            preprocessing.StandardScaler(),
            SGDRegressor(loss='epsilon_insensitive', max_iter=20000, tol=1e-5)
        ),
    }
Example #29
0
def standard_scale(X_train, X_test):
    """Standardize both sets with a scaler fitted on the training set only.

    Returns the transformed ``(X_train, X_test)`` pair.
    """
    scaler = prep.StandardScaler()
    scaler.fit(X_train)
    return scaler.transform(X_train), scaler.transform(X_test)
# In[18]:


# Labels: the raw values of the 'loan_status' column.
y = df['loan_status'].values
y[0:5]


# ## Normalize Data

# Data standardization gives the data zero mean and unit variance
# (technically this should be done after the train/test split).

# In[19]:


X= preprocessing.StandardScaler().fit(X).transform(X)
X[0:5]


# # Classification 

# We use the test set to report the accuracy of the model.
# We are going to use the following algorithms:
# - K Nearest Neighbor(KNN)
# - Decision Tree
# - Support Vector Machine
# - Logistic Regression
# 

# # K Nearest Neighbor(KNN)
#