def fit(self, X, y, lam_fpc):
    n, self.p = X.shape
    if self.standardize:
        self.enc = SS().fit(X)
    else:
        self.enc = SS()
        self.enc.mean_ = np.repeat(0, self.p)
        self.enc.scale_ = np.repeat(1, self.p)
    Xtil = self.enc.transform(X)
    ybar = y.mean()
    # lambda path from lmax down to lmax/1000 on a log scale
    lmax = max(np.abs(Xtil.T.dot(y - ybar) / n))
    lmin = lmax * 0.001
    lseq = np.exp(np.linspace(np.log(lmax), np.log(lmin), 100))
    self.l1 = Lasso(fit_intercept=True, copy_X=False, warm_start=True)
    e2 = np.repeat(0.0, len(lseq))
    ll2 = e2.copy()
    for ii, ll in enumerate(lseq):
        self.l1.alpha = ll
        self.l1.fit(Xtil, y)
        r2 = np.sqrt(np.sum((y - self.l1.predict(Xtil)) ** 2))
        e2[ii], ll2[ii] = r2, n * ll / r2
        if ll2[ii] < lam_fpc:
            print('Found solution!')
            self.supp = np.where(self.l1.coef_ != 0)[0]
            # refit a ridge-penalised logistic model on the selected support
            self.l1 = LogisticRegression(penalty='l2', C=1000,
                                         fit_intercept=True,
                                         solver='lbfgs', max_iter=1000)
            self.l1.fit(Xtil[:, self.supp], y)
            break
def fit(self, Xtrain, ytrain, validation_data=None, **vae_kwargs):
    """Fits a VAE oversampler.

    Arguments:
        Xtrain: training data
        ytrain: training labels
        validation_data: optional (Xtest, ytest) tuple
        vae_kwargs: variational-autoencoder keyword arguments passed to keras

    Returns:
        None
    """
    if validation_data is not None:
        Xtest, ytest = validation_data
    if self.rescale:
        self.ss = SS()
        self.ss.fit(Xtrain[ytrain == self.minority_class_id])
        X = self.ss.transform(Xtrain[ytrain == self.minority_class_id])
        if validation_data is not None:
            x_test = self.ss.transform(Xtest[ytest == self.minority_class_id])
    else:
        X = Xtrain[ytrain == self.minority_class_id]
        if validation_data is not None:
            x_test = Xtest[ytest == self.minority_class_id]
    if validation_data is not None:
        self.build_train(X, x_test=x_test, **vae_kwargs)
    else:
        self.build_train(X, **vae_kwargs)
def normalise_numeric_features(X, standardisation=False, means=True, stdev=True):
    """
    Normalisation for numeric features

    :param X: A numpy matrix of the data. First axis corresponding to instances,
        second axis corresponding to features
    :param standardisation: Whether standardisation (rescaling to [0, 1]) should be
        done instead of normalisation. Default: False
    :param means: Whether the mean should be normalised. Default: True
    :param stdev: Whether the standard deviation should be normalised. Default: True
    :return: X with numeric features normalised.
    """
    column_types = column_types_dataset(X, categorical=False)
    for i in range(len(column_types)):
        if column_types[i]:
            if standardisation:
                # "standardisation" branch: rescale the column to [0, 1]
                scaler = MMS(feature_range=(0, 1))
                X[:, i:i + 1] = scaler.fit_transform(X[:, i:i + 1])
            else:
                # "normalisation" branch: centre and/or scale to unit variance
                scaler = SS(with_mean=means, with_std=stdev)
                X[:, i:i + 1] = scaler.fit_transform(X[:, i:i + 1])
    return X
def __init__(self, mva=30, data=None):
    self.mva = mva
    self.scaler = SS()
    self.orig_data = data
    self.datasets = {}
    self.mvas = {}
    self.datasetcount = 0
    if data is not None:
        self.datasets['orig'] = data
def standardizeData(self, df_orig):
    '''standardize data with the help of sklearn's StandardScaler() class'''
    scaler = SS()
    scaled_columns = scaler.fit_transform(df_orig[self.column_selection])
    df_cp = df_orig[self.column_selection].copy()
    for num, column in enumerate(self.column_selection):
        df_cp[column + "_scaled"] = scaled_columns[:, num]
    # keep only the newly added "_scaled" columns
    return df_cp.iloc[:, len(self.column_selection):]
def scale_it(dat, tq=True):
    sh0, sh2 = dat.shape[0], dat.shape[2]
    # copy=False does the scaling in place, so we don't have to make a new list
    s = SS(copy=False)
    if tq:
        it = tqdm(range(sh0))
    else:
        it = range(sh0)
    for j in it:  # timesteps
        for i in range(sh2):  # number of indicators/etc
            _ = s.fit_transform(dat[j, :, i].reshape(-1, 1))[:, 0]
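
# A minimal usage sketch for scale_it above (not part of the original source):
# it assumes `dat` is a 3-D float array and that SS is sklearn's StandardScaler;
# with copy=False the scaling writes back into `dat` in place.
import numpy as np
from sklearn.preprocessing import StandardScaler as SS

dat = np.random.randn(5, 100, 3)   # 5 slices along axis 0, 100 samples, 3 indicators
scale_it(dat, tq=False)            # tq=False avoids the tqdm dependency here
print(dat[0, :, 0].mean().round(3), dat[0, :, 0].std().round(3))  # ~0.0 and ~1.0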
def scale_pcts(df):
    feats = [
        'close-open_pct', 'high-low_pct', 'close-close_pct', 'open-open_pct',
        'high-high_pct', 'low-low_pct', 'vol-vol_pct'
    ]
    scalers = []
    for f in feats:
        sc = SS()
        df[f + '_scaled'] = sc.fit_transform(df[f].values.reshape(-1, 1))
        scalers.append(sc)
    return df, scalers
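
# Hypothetical usage of scale_pcts above on a toy DataFrame; the column names
# are taken from the `feats` list inside the function, and the values are random.
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler as SS  # alias scale_pcts relies on

cols = ['close-open_pct', 'high-low_pct', 'close-close_pct', 'open-open_pct',
        'high-high_pct', 'low-low_pct', 'vol-vol_pct']
demo_df = pd.DataFrame(np.random.randn(50, len(cols)), columns=cols)
demo_df, demo_scalers = scale_pcts(demo_df)
print(demo_df.filter(like='_scaled').mean().round(3))  # each scaled column ~0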
def fit(self, X=None, y=None):
    """Pass.

    Parameters
    ----------
    X
        Ignored
    y
        Ignored
    """
    self._ss = SS(with_mean=self.norm_mean, with_std=self.norm_std)
    return self
def _calculate_component(XA: np.ndarray):
    """Calculate each component used in the dissimilarity measure for subsequences."""
    # distance component
    mu_XA = XA.mean(axis=0)
    # rotation component
    pca_XA = PCA()
    pca_XA.fit(SS().fit_transform(XA))
    e_vector_XA = pca_XA.components_
    # variance component
    p_XA = pca_XA.explained_variance_
    return mu_XA, e_vector_XA, p_XA
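
# Illustrative call to _calculate_component above on a random subsequence;
# the window length (64) and dimensionality (4) are arbitrary choices.
import numpy as np
from sklearn.decomposition import PCA                    # names the helper expects
from sklearn.preprocessing import StandardScaler as SS

window = np.random.randn(64, 4)
mu, e_vectors, variances = _calculate_component(window)
print(mu.shape, e_vectors.shape, variances.shape)  # (4,), (4, 4), (4,)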
def fit_resample(self, Xtrain, ytrain, validation_data=None, **vae_kwargs):
    """Fits a VAE oversampler and returns the resampled dataset.

    Arguments:
        Xtrain: training data
        ytrain: training labels
        validation_data: optional (Xtest, ytest) tuple
        vae_kwargs: variational-autoencoder keyword arguments passed to keras

    Returns:
        Xres, yres: resampled data and labels; attempts to balance the
        dataset to 50% minority class
    """
    if validation_data is not None:
        Xtest, ytest = validation_data
    num_samples_to_generate = max(
        Xtrain[ytrain != self.minority_class_id].shape[0]
        - Xtrain[ytrain == self.minority_class_id].shape[0], 100)
    if self.rescale:
        self.ss = SS()
        self.ss.fit(Xtrain[ytrain == self.minority_class_id])
        X = self.ss.transform(Xtrain[ytrain == self.minority_class_id])
        if validation_data is not None:
            x_test = self.ss.transform(Xtest[ytest == self.minority_class_id])
    else:
        X = Xtrain[ytrain == self.minority_class_id]
        if validation_data is not None:
            x_test = Xtest[ytest == self.minority_class_id]
    if validation_data is not None:
        self.build_train(X, x_test=x_test, **vae_kwargs)
    else:
        self.build_train(X, **vae_kwargs)
    # sample latent vectors from a standard normal and decode them into
    # synthetic minority-class examples
    z_sample = np.random.normal(0, 1, (num_samples_to_generate, self.latent_dim))
    outputs = self.decoder.predict(z_sample)
    if self.rescale:
        oversampled_X = self.ss.inverse_transform(outputs)
    else:
        oversampled_X = outputs
    oversampled_y = np.ones(num_samples_to_generate) * self.minority_class_id
    X_all = np.concatenate((Xtrain, oversampled_X))
    y_all = np.concatenate((ytrain, oversampled_y))
    return (X_all, y_all)
def rf():
    # toy data from make_classification is created but not used below
    X, y = make_classification(n_samples=1000, n_features=4, n_informative=2,
                               n_redundant=0, random_state=0, shuffle=False)
    with open(r'D:\work\DL_Predicting_Pharmacological\data\result_file\Paper_result\random_data_notd.txt',
              'r') as mesh_file:
        all_list = []
        for mesh_lines in mesh_file:
            mesh_lines = mesh_lines.strip().split('\t')
            all_list.append(mesh_lines)
    label = []
    all_data = np.array(all_list)
    all_data = SS().fit_transform(all_data)
    test_data = all_data[:16000]
    with open(r'D:\work\DL_Predicting_Pharmacological\data\result_file\Paper_result\random_label.txt',
              'r') as label_file:
        for label_line in label_file:
            label_line = label_line.strip()
            label.append(int(label_line))
    non_label = label
    # label = np_utils.to_categorical(label)
    vali_data = all_data[17001:17368]
    test_label = label[:16000]
    # note: despite the names, test_data/test_label are used to fit the model
    clf = RandomForestClassifier(max_depth=35, random_state=0)
    clf.fit(test_data, test_label)
    y = clf.predict(vali_data)
    pre_label = []
    # print(y)
    # for i in y:
    #     # i = list(i)
    #     pre_label.append(i.index(max(i)))
    print(y, 'predict')
    print(non_label[17001:17368])
    c = 0
    for i in range(len(y)):
        if y[i] == non_label[17001:17368][i]:
            c += 1
    print(c)
    print(c / len(y), 'rate')
def gen_log_problem_obj_uniq_ss():
    df = utils.load_enroll()
    log_df = utils.load_log()
    arr = []
    log_sz = len(log_df.groupby('enrollment_id'))
    for i, (eid, part_df) in enumerate(log_df.groupby('enrollment_id')):
        if i % 1000 == 0:
            l.info("{0} of {1}".format(i, log_sz))
        ev = part_df[part_df['event'] == 'problem']
        part_d = {'enrollment_id': eid}
        part_d['evuniq'] = len(ev['object'].unique())
        arr.append(part_d)
    feat_df = pd.DataFrame(arr)
    df = df.merge(feat_df, how='left', on='enrollment_id').fillna(0)
    return {'X': SS().fit_transform(utils.reshape(df['evuniq']))}
def pca_plot(col_nums=None, ann_cutoff=1 / 3):
    pca = PCA(whiten=False)
    ss = SS()
    X = pca.fit_transform(ss.fit_transform(zcta_df.fillna(zcta_df.median())))
    ann_factor = 20
    cols = zcta_df.columns
    if col_nums is None:
        col_nums = list(range(len(cols)))
    N = len(col_nums) + 1
    for coli in col_nums:
        for colj in col_nums:
            ploti = col_nums.index(coli)
            plotj = col_nums.index(colj)
            print(ploti, plotj, (plotj * N) + (ploti % N) + 1)
            plt.subplot(N, N, (plotj * N) + (ploti % N) + 1)
            plt.scatter(X[:, coli], X[:, colj], s=0.1)
            plt.xlabel("PC %s" % coli)
            plt.ylabel("PC %s" % colj)
            # project each original feature axis into this pair of PCs and
            # annotate features whose loading is large enough
            for i, col in enumerate(cols):
                arr = np.zeros(len(cols))
                arr[i] = 1
                proj = pca.transform([arr])
                x, y = proj[0][[coli, colj]]
                norm = sqrt(x**2 + y**2)
                print("col", i, "has norm", norm, "in graph", coli, colj)
                if norm > ann_cutoff:
                    plt.plot([0, ann_factor * x], [0, ann_factor * y])
                    plt.annotate(col, (ann_factor * x, ann_factor * y))
    plt.subplot(N, N, N**2)
    plt.plot(pca.explained_variance_ratio_)
    plt.plot(np.cumsum(pca.explained_variance_ratio_))
    for comp in range(len(pca.explained_variance_ratio_)):
        print("-" * 80)
        print(comp)
        print("explained variance ratio:", pca.explained_variance_ratio_[comp])
        sorted_loadings = sorted(zip(zcta_df.columns, pca.components_[comp]),
                                 key=lambda xy: xy[1], reverse=True)
        for col, load in sorted_loadings:
            print(col, load)
    plt.show()
# In[8]:

Y = data['class']               # actual output
X = data[data.columns[:-1]]     # input data features
data, target = X, Y

from sklearn.model_selection import train_test_split as SPLIT

# 70% of the data for training, 30% for testing
X_train, X_test, Y_train, Y_test = SPLIT(X, Y, test_size=0.3, random_state=4)

# ### Scale the Data

# In[9]:

from sklearn.preprocessing import StandardScaler as SS

X = SS().fit_transform(X)

# ## Train the Support Vector Classifier

# In[10]:

from sklearn.svm import SVC

# Hyperparameters
kernel = 'rbf'
C = 13
gamma = 0.325

from time import time as T

start = T()
model = SVC(kernel=kernel, C=C, gamma=gamma)
imp.fit(X[:, 1:3])
X[:, 1:3] = imp.transform(X[:, 1:3])

# Encoding categorical data
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

Labelx = LabelEncoder()
X[:, 0] = Labelx.fit_transform(X[:, 0])
print(X)

from sklearn.compose import ColumnTransformer

ct = ColumnTransformer(
    [('one_hot_encoder', OneHotEncoder(categories='auto'), [0])],
    remainder='passthrough')
X = np.array(ct.fit_transform(X), dtype=float)  # np.float is removed in recent NumPy

Labely = LabelEncoder()
Y = Labely.fit_transform(Y)
print(Y)

# Splitting the data into training and test sets
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2,
                                                    random_state=0)

# Feature Scaling
from sklearn.preprocessing import StandardScaler as SS

sc_X = SS()
x_train = sc_X.fit_transform(x_train)
x_test = sc_X.transform(x_test)
df_full = df_full.fillna(-999)  # -999
df_full['CR_AMB_Drop_EOP_1'] = df_full.CR_AMB_Drop_Build_1 * df_full.EOP_prev1
df_full['I_CR_AQB_EOP_1'] = df_full.I_CR_AQB_PrevQ1 / df_full.EOP_prev1
df_full['I_AQB_EOP_1'] = df_full.I_AQB_PrevQ1 / df_full.EOP_prev1
df_full['D_Prev1_EOP_1'] = df_full.D_prev1 / df_full.EOP_prev1
df_full['CR_AMB_Drop_1_D_prev1'] = df_full.CR_AMB_Drop_Build_1 / df_full.D_prev1
df_full['CR_AMB_Drop_2_D_prev1'] = df_full.CR_AMB_Drop_Build_2 / df_full.D_prev1
df_full['CR_AMB_Drop_1_vintage'] = df_full.CR_AMB_Drop_Build_1 / df_full.vintage
df_full = df_full.replace([np.inf, -np.inf], np.nan)
df_full = df_full.fillna(-999)
df_full = SS().fit_transform(df_full)

df_train = df_full[:300000]
df_test = df_full[300000:]
gc.collect()

lgb_train = lgb.Dataset(df_train, Y)
lgb_params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'nthread': -1,
    'silent': True,
    'num_leaves': 2**8 - 1,
    'learning_rate': 0.02,
    'max_depth': 8,
    'max_bin': 2**8 - 1,
    'metric': 'auc',
    'colsample_bytree': 0.33,  # 0.4
Y = encoder.transform(Y)
# new_Y = pd.DataFrame(encoded_Y)
'''print('\n X.head(10): after 1 hot')
print(X.head(10))
print('\n new_Y.head(10)')
print(new_Y.head(10))'''

# convert X and new_Y to numpy arrays
X = X.values
print('\n new Y.shape')
print(Y.shape)

# standardize X
scaler = SS().fit(X)
rescaledX = scaler.transform(X)

# split into train/test sets using t_t_s
# because we combined the datasets to apply uniform one-hot and label
# encoding, we set the 'shuffle' parameter to False; we also know that
# there should be 15060 rows in the test set
test_set_size = test_dataset_nomissing.shape[0]
print('\n test_set_size...')
print(test_set_size)
X_train, X_test, Y_train, Y_test = t_t_s(rescaledX, Y,
                                         test_size=test_set_size,
                                         random_state=seed, shuffle=False)

# instantiate XGBC class using defaults
model = XGBC()

# evaluate the model against the training dataset using stratified k-fold
# Importing the dataset
dataset = pd.read_csv('Social_Network_Ads.csv')
X = dataset.iloc[:, [2, 3]].values
Y = dataset.iloc[:, 4].values

# Splitting the data into training data and test data
from sklearn.model_selection import train_test_split

X_Train, X_Test, Y_Train, Y_Test = train_test_split(X, Y, test_size=0.2,
                                                    random_state=0)

# Feature scaling has to be done for the k-nearest neighbours algorithm
from sklearn.preprocessing import StandardScaler as SS

sc = SS()
X_Train = sc.fit_transform(X_Train)
X_Test = sc.transform(X_Test)

# Creating the model and fitting it to the data
from sklearn.neighbors import KNeighborsClassifier as KNC

classifier = KNC(n_neighbors=5, metric='minkowski', p=2)
classifier.fit(X_Train, Y_Train)

# Predicting the result
y_pred = classifier.predict(X_Test)

# Creating the confusion matrix for finding the accuracy of the model
from sklearn.metrics import confusion_matrix  # confusion_matrix is a function, not a class

cm = confusion_matrix(Y_Test, y_pred)
def __init__(self):
    self._scaler = SS()
def make_zip_pca(zcta_df):
    pca = PCA(whiten=False)
    ss = SS()
    X = pca.fit_transform(ss.fit_transform(zcta_df.fillna(zcta_df.median())))
    return X
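
# A self-contained sketch of calling make_zip_pca above; the toy DataFrame
# stands in for the real zcta_df, which is not shown in this snippet.
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA                    # aliases assumed above
from sklearn.preprocessing import StandardScaler as SS

toy_zcta = pd.DataFrame(np.random.rand(200, 6),
                        columns=[f'feat_{i}' for i in range(6)])
pc_scores = make_zip_pca(toy_zcta)
print(pc_scores.shape)  # (200, 6): one row of principal-component scores per record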
# In[8]:

datax.columns = datax.iloc[0]
datax.drop(0, inplace=True, axis=1)
datax.drop('!series_matrix_table_end', axis=1, inplace=True)
datax.drop('Probe_ID', inplace=True)
datax.head()

# datax.to_csv('Alzh_Features_Wrangled.csv')
# datay.to_csv('Alzh_Labels_Wrangled.csv')

# In[ ]:

from sklearn.preprocessing import StandardScaler as SS
from sklearn.neural_network import MLPClassifier as MLP
from sklearn.pipeline import Pipeline

scaler = SS()  # avoid shadowing the StandardScaler alias with an instance
clf = MLP()
# print(clf.get_params().keys())
pipe = Pipeline(steps=[('scaler', scaler), ('MLP', clf)])
params = {
    'MLP__hidden_layer_sizes': list(range(1000, 30000, 1000)),
    'MLP__activation': ['logistic', 'tanh', 'relu']
}
grid_search = GSCV(pipe, params, cv=8, scoring='accuracy')
grid_search.fit(datax, datay)
best_act = grid_search.best_params_.get('MLP__activation')
best_hl = grid_search.best_params_.get('MLP__hidden_layer_sizes')  # key must match the param grid
print('Best Parameters:', grid_search.best_params_)
print("Accuracy:", grid_search.best_score_)
def Standard_norm(arr):
    scalerSS = SS()
    scalerSS.fit(arr)
    arrSS = scalerSS.transform(arr)
    return arrSS
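
# Quick check of Standard_norm above on random 2-D data (the shape is arbitrary);
# the scaled columns should come back with roughly zero mean and unit variance.
import numpy as np
from sklearn.preprocessing import StandardScaler as SS  # alias the function expects

demo_arr = np.random.rand(100, 3) * 10
demo_scaled = Standard_norm(demo_arr)
print(demo_scaled.mean(axis=0).round(3), demo_scaled.std(axis=0).round(3))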
targets = data.Survived
targets = targets.map({'Died': 0, 'Survived': 1})
data.drop(columns=['Survived'], inplace=True)
data.Pclass = data.Pclass.map({'Poor': 1, 'Medium': 2, 'Upper': 3})
data_model = pd.get_dummies(data=data, drop_first=True)

X_train, X_test, y_train, y_test = train_test_split(data_model, targets,
                                                    test_size=0.15,
                                                    random_state=seed,
                                                    stratify=targets)

# Model instantiation
logit = LR(random_state=seed, solver='lbfgs', max_iter=300)
rf = RFC(n_estimators=250, random_state=seed)
gb = GBC(n_estimators=250, random_state=seed)
xgb = xgb.XGBClassifier(objective='reg:logistic', n_estimators=250, seed=42)
svm = SVC(random_state=seed, probability=True)

models = [logit, rf, gb, xgb, svm]
labels = ['Died', 'Survived']

# Scaling (needed for the SVM)
scaler = SS()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


def fit_metrics(model, Xtr, ytr, Xts, yts, labels):
    print(model.__class__.__name__ + ' Results:')
    model.fit(Xtr, ytr)
    cm = m.confusion_matrix(yts, model.predict(Xts))
    plot_matrix(cm, classes=labels, normalize=True,
                title='Confusion Matrix for Titanic Test Data')
    plt.show()
    # evaluate the model just fitted (the original mistakenly reused `logit` here)
    plot_roc_auc(yts, model.predict_proba(Xts)[:, 1])
    plot_precision_recall(yts, model.predict_proba(Xts)[:, 1])
    classification_metrics(yts, model.predict(Xts))


# Need to add cross-validation for more reliable results; here we get a bird's-eye view
for model in models:
    print('*' * 25)
    fit_metrics(model, X_train, y_train, X_test, y_test, labels)
CLR = list(range(len(Y)))
for i in range(len(Y)):
    if Y[i] == 0:
        CLR[i] = 'a'
    elif Y[i] == 0.19:
        CLR[i] = 'b'
    elif Y[i] == 2.5:
        CLR[i] = 'c'
    elif Y[i] == 4.5:
        CLR[i] = 'd'
Y = CLR

# Standard-scaling the data
from sklearn.preprocessing import StandardScaler as SS

ss = SS()
X = ss.fit_transform(X)

# train/test splitting for analysis of the optimal number of parameters
from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2,
                                                    random_state=0)
X_cv, X_test, Y_cv, Y_test = train_test_split(X_test, Y_test, test_size=0.5,
                                              random_state=0)

# Modelling
all_list = []
label = []  # the list is appended to below, so it must exist before the loops
with open(r'/Users/libingrui/Desktop/Work_file/Data/0610_path_data.txt',
          'r') as mesh_file:
    for mesh_lines in mesh_file:
        mesh_lines = mesh_lines.strip().split('\t')
        all_list.append(mesh_lines)

with open(r'/Users/libingrui/Desktop/Work_file/Data/0610_label.txt',
          'r') as label_file:
    for label_line in label_file:
        label_line = label_line.strip()
        label.append(int(label_line))

# X = dt
# y = label
all_data = np.array(all_list)
all_data = SS().fit_transform(all_data)
# label = np_utils.to_categorical(label)

X_train = all_data[:3000]
X_test = all_data[3001:3985]  # was 30001:3985, an empty slice that could not match y_test
y_train = label[:3000]
y_test = label[3001:3985]
# print(y_train)

# algorithm parameters
params = {
    'booster': 'gbtree',
    'objective': 'multi:softmax',
    'num_class': 6,
    'gamma': 0.1,
    'max_depth': 6,
    'lambda': 2,
    'subsample': 0.7,
"matrix", type=str, help="the kmer frequency matrix file in csv format, with header and index") parser.add_argument("label", type=str, help="the label of matrix") args = parser.parse_args() # load your data and label as text format allmatrix = pd.read_csv(args.matrix, header=0, index_col=0, low_memory=True).values target = np.loadtxt(args.label) print("allmatrix shape: {};label shape: {}".format(allmatrix.shape, target.shape)) # standarize your data allmatrix = SS().fit_transform(allmatrix) # transform your data to tensor x = tf.convert_to_tensor(allmatrix, dtype=tf.float32) y = tf.convert_to_tensor(target, dtype=tf.int32) # split train, validation, and test data with ratio 7:1:2 idx = tf.range(allmatrix.shape[0]) idx = tf.random.shuffle(idx) x_train, y_train = tf.gather(x, idx[:int(0.7 * len(idx))]), tf.gather( y, idx[:int(0.7 * len(idx))]) x_val, y_val = tf.gather( x, idx[int(0.7 * len(idx)):int(0.8 * len(idx))]), tf.gather( y, idx[int(0.7 * len(idx)):int(0.8 * len(idx))]) x_test, y_test = tf.gather(x, idx[int(0.8 * len(idx)):]), tf.gather( y, idx[int(0.8 * len(idx)):])
resultant = sorted(resultant, key=lambda x: x[1], reverse=True)
print("\n**************************\n")
resultant = [x for x, _ in resultant]
# resultant.sort(reverse=True)
print(resultant)

vectors = []
for d, _ in enumerate(chapters):
    vectors = vectors + [[]]
    for j in resultant:
        if j in chapters[d]:
            vectors[d] = vectors[d] + [chapters[d][j]]
        else:
            vectors[d] = vectors[d] + [0]
vectors = np.array(vectors)
print(vectors)

vectors = SS().fit_transform(vectors)
print(vectors)

pca = PCA(n_components=2)
pca.fit(vectors)
print(pca.components_)
scores = pca.transform(vectors)

labels = []
for x in range(comeco, fim):
    labels += [x]

plt.scatter(scores[:, 0], scores[:, 1])
for i, l in enumerate(labels):
    plt.annotate(l, xy=(scores[i, 0], scores[i, 1]),
X_train, X_test, y_train, y_test = SPLIT(wine.data, wine.target,
                                         test_size=0.25,
                                         stratify=wine.target,
                                         random_state=123)

# printing class distribution of the test dataset
print(f'Classes: {np.unique(y_test)}')
print(f'Class distribution for test data: {np.bincount(y_test)}')

# MLP is sensitive to feature scaling, hence performing scaling
# Options: MinMaxScaler and StandardScaler
# from sklearn.preprocessing import MinMaxScaler
# scaler = MinMaxScaler()
from sklearn.preprocessing import StandardScaler as SS

# fit the scaler on the training data only, then apply it to both splits
# (the original fitted a separate scaler on the test set)
scaler = SS().fit(X_train)
X_train_stdsc = scaler.transform(X_train)
X_test_stdsc = scaler.transform(X_test)

# Setting the hyperparameters of the network
from sklearn.neural_network import MLPClassifier as MLP

mlp = MLP(hidden_layer_sizes=(10, ), learning_rate_init=0.001, max_iter=5000)

# Calculating training time: more neurons, more time
from time import time

start = time()
# Train the model using the scaled training set
mlp.fit(X_train_stdsc, y_train)
end = time()
print(f'Training Time: {(end-start)*1000:.3f}ms')

# Predict the response for the test dataset
good_up = resample(good, replace=True, n_samples=550, random_state=seed)

# Dimension reduction vs without dimension reduction
from helper_funcs import model_reduce, plot_roc_auc, plot_precision_recall, fit_metrics
from sklearn.model_selection import train_test_split as tts
from sklearn.preprocessing import StandardScaler as SS
from sklearn.model_selection import RandomizedSearchCV as RSCV

qualities = wines['quality']
wines.drop(columns=['quality'], inplace=True)
X_train, X_test, y_train, y_test = tts(wines, qualities, test_size=0.2,
                                       random_state=seed, stratify=qualities)

scaler = SS()
X_train.loc[:, X_train.columns] = scaler.fit_transform(
    X_train.loc[:, X_train.columns])
X_test.loc[:, X_test.columns] = scaler.transform(X_test.loc[:, X_test.columns])

from sklearn.ensemble import RandomForestClassifier as RFC, GradientBoostingClassifier as GBC
from sklearn.linear_model import LogisticRegression as LR
import xgboost as xgb
from sklearn.metrics import r2_score

logit = LR(random_state=seed, solver='lbfgs', max_iter=300, multi_class='auto')
rf = RFC(n_estimators=250, random_state=seed)
gb = GBC(n_estimators=250, random_state=seed)
xgb = xgb.XGBClassifier(objective='reg:logistic', n_estimators=250, seed=42)
models = [logit, rf, gb, xgb]
b = pd.DataFrame(pr.normalize(y, norm='l1', axis=0))
b.iloc[:, 0].apply(lambda x: abs(x)).sum()

# l2-norm gives unit Euclidean length: the vector is divided by sqrt(a^2 + b^2)
# l1-norm gives unit Manhattan length: the vector is divided by abs(a) + abs(b)
# Standardization is centering with unit variance
# axis=0 refers to a feature (column), while axis=1 refers to a sample (row)
# http://scikit-learn.org/stable/modules/preprocessing.html
c = pd.DataFrame(pr.scale(y, axis=0))

# How to standardise training and test data
x = pd.DataFrame({'a': [randint(-10, 10) for i in range(20)]})
y = pd.DataFrame({'b': [randint(-10, 10) for i in range(20)]})
scalar = SS().fit(x)
x_scalar = scalar.transform(x)
y_scalar = scalar.transform(y)
scalar.mean_
scalar.var_

state = list(np.repeat('PA', 5))
total = [randint(10000, 500000) for i in range(5)]
Obama = [round(randint(0, 300) / 3.0, 3) for i in range(5)]
Romney = [round(randint(0, 300) / 3.0, 3) for i in range(5)]
winner = [randint(0, 1) for i in range(5)]
election = pd.DataFrame({'state': state, 'total': total, 'Obama': Obama,
                         'Romney': Romney, 'winner': winner})
election['winner'] = election['winner'].apply(lambda x: 'Romney' if x == 1 else 'Obama')
election['country'] = ['Adams', 'Allegheny', 'Armstrong', 'Beaver', 'Bedford']
election.set_index('country', inplace=True)