def convert_row(row, scale):
    """Convert a CCEPC row into numpy.ndarrays.

    :param row: row of the dataset
    :type row: pandas.Series
    :param scale: whether to scale the converted data
    :type scale: bool
    :return: tuple of sample ID and the converted data into numpy.ndarrays
    :rtype: tuple
    """
    a = row["A"].split(" ")
    b = row["B"].split(" ")

    if a[0] == "":
        a.pop(0)
        b.pop(0)
    if a[-1] == "":
        a.pop(-1)
        b.pop(-1)

    a = array([float(i) for i in a])
    b = array([float(i) for i in b])

    if scale:
        a = scaler(a)
        b = scaler(b)
    return row['SampleID'], a, b
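# Hedged usage sketch (not part of the original source): assumes convert_row is defined
# in a module that imports `from numpy import array` and
# `from sklearn.preprocessing import scale as scaler`, and that rows carry space-separated
# value strings in columns "A" and "B" plus a "SampleID" column.
import pandas as pd

row = pd.Series({"SampleID": "pair0001",
                 "A": " 1.0 2.0 3.0 ",
                 "B": " 4.0 5.0 6.0 "})
sample_id, a, b = convert_row(row, scale=True)
print(sample_id, a.shape, b.shape)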
def outlier_transform(X):
    # Scale the numeric columns, then drop rows that lie more than
    # 3 standard deviations from the mean of any numeric column.
    num_features = list(X.select_dtypes(include=['float64', 'int64']))
    X[num_features] = scaler().fit_transform(X[num_features])
    for var in X.select_dtypes(include=['float64', 'int64']):
        # scaler before
        X = X[np.abs(X[var] - X[var].mean()) <= (3 * X[var].std())]
    return X
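# Hedged usage sketch (not part of the original source): assumes the snippet above runs
# with `import numpy as np`, `import pandas as pd`, and
# `from sklearn.preprocessing import StandardScaler as scaler` in scope.
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler as scaler

demo = pd.DataFrame({"value": np.r_[np.random.normal(size=99), 50.0],
                     "city": ["x"] * 100})
filtered = outlier_transform(demo)
print(len(demo), "->", len(filtered))  # rows more than 3 standard deviations out are dropped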
def preprocess_data(self):
    # Step 1 - One Hot Encode
    self.get_categorical_columns()
    print('Step 2 - Categorical Column Identification Complete ...')
    self.x_train = pd.get_dummies(self.x_train,
                                  columns=self.categorical_columns,
                                  prefix='one_hot_encoded_')
    self.get_training_columns(self.x_train)

    # Hotfix for XGBoost: feature names must not contain '<'
    for column in self.traincols:
        if "<" in column:
            self.x_train.rename(index=str,
                                columns={column: column.replace("<", "")},
                                inplace=True)
    self.get_training_columns(self.x_train)

    # Keep all but the last one-hot-encoded column
    encoded_columns = [i for i in self.traincols if "one_hot_encoded_" in i][:-1]
    not_encoded_columns = [i for i in self.traincols if "one_hot_encoded_" not in i]
    self.x_train = self.x_train[self.union(encoded_columns, not_encoded_columns)]
    self.get_training_columns(self.x_train)
    print('Step 3 - One Hot Encoding Complete ...')

    # Step 2 - Null Value Impute
    imputer = Imputer(strategy='mean', copy=False)
    self.x_train = pd.DataFrame(data=imputer.fit_transform(self.x_train),
                                columns=self.traincols)
    print('Step 4 - Null Value Imputation Complete ...')

    # Step 3 - Feature Scaling
    sc_X = scaler(copy=False)
    self.x_train[not_encoded_columns] = sc_X.fit_transform(self.x_train[not_encoded_columns])
    print('Step 5 - Standardisation Complete ...')

    self.x_train, self.x_test, self.y_train, self.y_test = train_test_split(
        self.x_train, self.y_train, test_size=0.2, random_state=1)
    print('Step 6 - Train Test Splitting Complete ...')
    print('Shape: ' + str(self.x_train.shape))

    return (self.df, self.x_train, self.y_train, self.x_test, self.y_test,
            self.traincols, self.categorical_columns)
def methodANM_IGCI(X, Y):
    answerANM = 0
    answerIGCI = 0
    data = pd.Series({"X": scaler(X), "Y": scaler(Y)})

    m = ANM()
    pred = m.predict(data)
    # print(pred, "(ANM, value: 1 if X->Y and -1 if Y->X)")
    if pred > 0:
        answerANM = 1

    m = IGCI()
    pred = m.predict(data)
    # print(pred[0], "(IGCI, value: >0 if X->Y and <0 if Y->X)")
    if pred > 0:
        answerIGCI += 1

    return answerANM, answerIGCI
def dataset(self, a, b, scale=False, shape=(-1, 1)):
    """Produce a PairwiseDataset of two variables out of the data.

    Args:
        a (str): Name of the first variable
        b (str): Name of the second variable
        scale (bool): scale the data to zero mean and unit variance
        shape (tuple): desired shape of the `torch.Tensor` of `a` and `b`

    Returns:
        PairwiseDataset: the new pairwise dataset
    """
    a = self.data[:, self.names[a]]
    b = self.data[:, self.names[b]]
    if scale:
        a = scaler(a)
        b = scaler(b)
    return PairwiseDataset(th.Tensor(a).view(*shape),
                           th.Tensor(b).view(*shape))
def individual_images_to_pca(ind_image):
    # Resize the image, extract Canny edges, and project the flattened
    # edge map with PCA keeping 95% of the variance.
    img = cv2.resize(ind_image, (64, 64))
    edges = cv2.Canny(img, 64, 64)
    edges = edges.reshape(1, 4096)
    pca = PCA(.95)
    s = scaler()
    x = s.fit_transform(edges)
    lower_dimension = pca.fit_transform(x)
    approximation = pca.inverse_transform(lower_dimension)
    return lower_dimension, approximation
def clean_df(df):
    df = df.fillna("None")
    df = df.replace(["", " ", "None"], [None, None, None])
    numeric_columns = ["Bateria", "CamaraFrontal", "CamaraPosterior", "Garantia",
                       "MemoriaInterna", "RAM", "Price", "PuntajeAntutu", "PuntajeK"]
    units = ["mAh", "Mpx", "Mpx", "Meses", "GB", "GB", "$", "", ""]
    df = clean_scale_numeric_columns(df, numeric_columns, units)
    df["ResistenciaAgua"] = df.apply(lambda row: get_watter_resistance(row["ResistenciaAgua"]), axis=1)
    df["Resolucion"] = df.apply(lambda row: get_resolution(row["Resolucion"]), axis=1)
    df["Resolucion_S"] = scaler(feature_range=(0.1, 1)).fit_transform(
        np.array(df["Resolucion"]).reshape(1, -1).transpose())
    df["Score"] = df.apply(score, axis=1)
    df["CalidadPrecio"] = df.apply(price_quality, axis=1)
    return df
def to_pca(x):
    dir = '../Models2/'
    pca = PCA(.95)
    s = scaler()
    x = s.fit_transform(x)
    lower_dimension = pca.fit_transform(x)
    approximation = pca.inverse_transform(lower_dimension)
    dims = pca.n_components_
    joblib.dump(s.scale_, dir + 'scaler')
    with open(dir + 'pca', 'wb') as file:
        pickle.dump(pca, file)
    np.save(dir + 'lower_dimension', lower_dimension)
    np.save(dir + 'approximation', approximation)
    return lower_dimension, approximation
def reshape_data(df_data, list_variables, type_variables):
    list_array = []
    dim_variables = {}
    for var in list_variables:
        if type_variables[var] == "Categorical":
            data = df_data[var].values
            data = get_dummies(data).as_matrix()
            data = data.reshape(data.shape[0], data.shape[1])
        elif type_variables[var] == "Numerical":
            data = scaler(df_data[var].values)
            data = data.reshape(data.shape[0], 1)
        dim_variables[var] = data.shape[1]
        list_array.append(data)
    return concatenate(list_array, axis=1), dim_variables
def __init__(self, data, names=None, device=None, scale=True):
    super(MetaDataset, self).__init__()
    if names is not None:
        self.names = names
    else:
        try:
            assert isinstance(data, DataFrame)
        except AssertionError:
            raise TypeError('If names is not specified, '
                            'data has to be a pandas.DataFrame')
        self.names = OrderedDict([(i, idx) for idx, i in enumerate(data.columns)])
    if isinstance(data, DataFrame):
        data = data.values
    if scale:
        self.data = th.Tensor(scaler(data))
    else:
        self.data = th.Tensor(data)
    if device is not None:
        self.data = self.data.to(device)
import os
import warnings

import matplotlib.pyplot as plt
import pandas as pd
from sklearn import decomposition
# from sklearn import datasets
from sklearn.preprocessing import StandardScaler as scaler

warnings.filterwarnings('ignore')

os.chdir("C:/Users/NgocBien/Desktop/MachineLearningProjet/MachineLearning/TPML/TPML")

data2 = pd.read_csv('./crime.csv', sep=';')
X2 = data2.ix[:, 1:7].values
labels2 = data2.ix[:, 0].values

pca = decomposition.PCA(n_components=3)
# These lines tell us how much information is kept when doing PCA.
X2_norm = scaler().fit_transform(X2)
pca.fit(X2_norm)
print(pca.singular_values_)
print(pca.explained_variance_ratio_)

# Retrieve the PCA coordinates on the 3 axes and project onto the first 2 axes.
X2_pca = pca.fit_transform(X2)

plt.scatter(X2_pca[:, 0], X2_pca[:, 1])
for label, x, y in zip(labels2, X2_pca[:, 0], X2_pca[:, 1]):
    plt.annotate(label, xy=(x, y), xytext=(-0.2, 0.2), textcoords='offset points')
plt.show()
def clean_scale_numeric_columns(df, columns, units):
    for column, unit in list(zip(columns, units)):
        df[column] = df.apply(lambda row: clean_numeric_row(row[column], unit), axis=1)
        df["{}_S".format(column)] = scaler(feature_range=(0.1, 1)).fit_transform(
            np.array(df[column]).reshape(1, -1).transpose())
    return df
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Import the dataset
df = pd.read_csv("Position_Salaries.csv")

# Select the data for the independent variable (x) and for y
x = df.iloc[:, 1:2].values
y = df.iloc[:, -1].values

# Feature scaling
from sklearn.preprocessing import StandardScaler as scaler
sc_x = scaler()
sc_y = scaler()
x = sc_x.fit_transform(x)
y = sc_y.fit_transform(y.reshape(-1, 1))

# ------<FIT REGRESSION MODELS>----------
# Fit linear regression to the dataset
"""
from sklearn.linear_model import LinearRegression
linearR = LinearRegression()
linearR.fit(x, y)
print(linearR)
"""
def main():
    np.random.seed(42)
    urls = [
        'http://www.ehu.eus/ccwintco/uploads/6/67/Indian_pines_corrected.mat',
        'http://www.ehu.eus/ccwintco/uploads/c/c4/Indian_pines_gt.mat',
    ]
    for url in urls:
        download_dataset(url)

    gt = load_data(DATA / 'Indian_pines_gt.mat')
    plt.imsave(IMG / 'gt.png', gt)

    ipc = load_data(DATA / 'Indian_pines_corrected.mat')
    p111 = scale2int(ipc[..., 111])
    plt.imsave(IMG / '111.png', p111)
    plt.imsave(IMG / '111_canny.png', canny(p111))

    data = get_data(DATA / 'indian_pines.csv', gt, ipc)
    X = data.copy().astype(np.float64)
    y = X.pop('target').astype(int)
    unique_y = len(y.unique())

    X2 = scaler().fit(X).transform(X)
    n_components = 4
    pca = PCA(n_components=n_components).fit(X2, y)
    X_pca = pca.fit_transform(X2)

    fig, ax = plt.subplots(1, 1)
    ax.set_xlabel('Principal Components')
    ax.set_ylabel('Variance Ratio')
    ax.set_title('Variance ratio for PCA on Indian Pines dataset')
    ax.grid()
    ax.set_xticks(range(1, n_components + 1))
    ax.bar(range(1, n_components + 1), pca.explained_variance_ratio_)
    fig.savefig(IMG / 'pca_components.png')

    colorlist = np.random.choice(list(cnames.keys()), unique_y, replace=False).tolist()
    colors = y.map(lambda x: colorlist[x])

    df = pd.DataFrame(X_pca[:, :2])
    df = pd.concat([df, y, colors], axis=1)
    df.columns = ['PC1', 'PC2', 'target', 'color']
    df_0 = df[df['target'] != 0]

    fig, ax = plt.subplots(1, 1)
    ax.set_xlabel('PC-1')
    ax.set_ylabel('PC-2')
    ax.set_title('PCA on Indian Pines dataset')
    ax.grid()
    ax.scatter(df_0['PC1'], df_0['PC2'], color=df_0['color'], s=3)
    fig.savefig(IMG / 'pc1_pc2.png')

    img = (df['PC1'] + df['PC2']).values.reshape((145, 145))
    plt.imsave(IMG / 'pc12.png', img)
    c = canny(img, sigma=2., low_threshold=.15, high_threshold=.6, use_quantiles=True)
    plt.imsave(IMG / 'pc12_canny.png', c)

    gt2 = cv2.imread((IMG / 'gt.png').as_posix(), 0)
    plt.imsave(IMG / 'gt_canny.png', canny(gt2))
# trX, teX, trY, teY = _read_split(
#     "../datasets/nd-data/boundary.csv",
#     read=1, oneHot=0)

# Integrating SMOTE with DAEGO:
# perform SMOTE at an intermediate stage of training via a stacked denoising autoencoder
from algorithms.utils import _read_dat

trX, teX, trY, teY = _read_dat("dataset/page-blocks0.dat",
                               skip=15, read=1, oneHot=0)
scaler = scaler()
trX = scaler.fit_transform(trX)
teX = scaler.fit_transform(teX)

from mlxtend.tf_classifier import TfSoftmaxRegression

trY = trY.astype(int)

print trX.shape[1], "Input Feature Space"
print "Enter Layers"
layer = input()
print "Enter the layer no after which SMOTE is to be performed"
l_s = int(input())
l_encoder = layer[:l_s]

model_bs = StackedAutoEncoder(dims=l_encoder,
                              activations=['tanh' for i in range(len(l_encoder))],
                              noise='gaussian',
                              epoch=[10000 for i in range(len(l_encoder))],
                              loss='rmse',
def min_max_scaling(features_train, features_test):
    from sklearn.preprocessing import MinMaxScaler as scaler
    features_train = scaler().fit_transform(features_train)
    features_test = scaler().fit_transform(features_test)
    return features_train, features_test
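# Hedged usage sketch (not part of the original source): small synthetic feature matrices.
# Note that the function above fits a separate MinMaxScaler on the train and test sets.
import numpy as np

X_train = np.array([[1.0, 10.0], [2.0, 20.0], [3.0, 30.0]])
X_test = np.array([[1.5, 15.0], [2.5, 25.0]])
X_train_s, X_test_s = min_max_scaling(X_train, X_test)
print(X_train_s.min(axis=0), X_train_s.max(axis=0))  # each column mapped to [0, 1]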
            bbox_inches='tight')
plt.close("all")

# Make an array of the data to be used for clustering,
# and delete pca_slices, scaled_slices, energy and amplitudes
n_pc = 3
data = np.zeros((len(pca_slices), n_pc + 2))
data[:, 2:] = pca_slices[:, :n_pc]
data[:, 0] = energy[:] / np.max(energy)
data[:, 1] = np.abs(amplitudes) / np.max(np.abs(amplitudes))
data = np.concatenate((data, pca_autocorr[:, :3]), axis=-1)
data = np.concatenate((data, conv_pca_slices), axis=-1)

# Standardize features in the data since they occupy very uneven scales
standard_data = scaler().fit_transform(data)

# We can whiten the data and potentially use diagonal covariances
# for the GMM to speed things up. Not sure how much this step helps.
data = pca(whiten=True).fit_transform(standard_data)

del pca_slices
del scaled_slices
del energy
del slices_autocorr, scaled_autocorr, pca_autocorr

# Set a threshold on how many datapoints are used to FIT the gmm
dat_thresh = 10e3

# Run GMM, from 2 to max_clusters
for i in range(max_clusters - 1):
remainder="passthrough") # Leave the rest of the columns untouched X = onehotencoder.fit_transform(X) X = X[:, 1:] #dividir dataset en conjunto de entrenamiento y testing from sklearn.model_selection import train_test_split as splitter x_train, x_test, y_train, y_test = splitter(X, y, test_size=0.2, random_state=0) #Escalado de variables from sklearn.preprocessing import StandardScaler as scaler scala_x = scaler() x_train = scala_x.fit_transform(x_train) x_test = scala_x.transform(x_test) #------<AJUSTAR MODELOS DE CLASIFICACION>---------- """ #Ajustar regresion con el el conjunto de entrenamiento #Crear modelo de clasificación aqui from sklearn.linear_model import LogisticRegression classifier=LogisticRegression(random_state=0) classifier.fit(x_train,y_train) print(classifier)
        dict_of_df = {k: pd.DataFrame(v) for k, v in self.result.items()}
        result_df = pd.concat(dict_of_df, axis=1)
        result_df.to_csv(os.path.join(self.path, 'result.csv'))
        with open(os.path.join(self.path, 'grid_search_params'), 'w') as f:
            yaml.dump(dict(self.grid_params), f, default_flow_style=False)


def range_nfo(min_n_samples, n_features, n_points):
    return np.unique(
        np.linspace(2, min(min_n_samples, n_features), n_points, dtype=int)).tolist()


if __name__ == '__main__':
    simple_knn = Pipeline([('scaler', scaler()), ('knn', knn())])
    lmnn_knn = Pipeline([('scaler', scaler()), ('lmnn', lmnn()), ('knn', knn())])
    nca_knn = Pipeline([('scaler', scaler()), ('nca', nca()), ('knn', knn())])
    pca_knn = Pipeline([('scaler', scaler()), ('pca', pca()), ('knn', knn())])

    cfg_file = sys.argv[1]
    with open(cfg_file, 'r') as f:
        config = yaml.load(f)
    datasets = config['datasets']

    gs = GS(n_folds=3, random_state=RANDOM_SEED)
    for dataset_name in datasets:
        print("Benchmarking dataset {}...".format(dataset_name))
        dataset_func = DATASETS[dataset_name]