def calculate_district_dis(dis_style = "euclidean"): print(dis_style) cal_what_dis = calculate_function[dis_style] poi_df = pd.read_csv(os.path.join(DATA_DIR, CONCRETE_DIR, POI_SHEET_DIR, "poi_data.csv")) # get all the poi data in dataframe districts_poi = poi_df.values[:, 1:] scaler = MaxAbsScaler() scalered_districts_poi = scaler.fit_transform(districts_poi) if dis_style == "canberra": scalered_districts_poi = districts_poi result = OrderedDict() for based_d in range(districts_poi.shape[0]): result[based_d + 1] = OrderedDict() based_district_poi = scalered_districts_poi[based_d] for c_d in range(districts_poi.shape[0]): compare_district_poi = scalered_districts_poi[c_d] result[based_d + 1][c_d + 1] = cal_what_dis(based_district_poi, compare_district_poi) result[based_d + 1] = sorted(result[based_d + 1].items(), key=lambda d:d[1]) return result
def scale(df, scaling=None):
    """Scale data included in pandas dataframe.

    Parameters
    ----------
    df : pandas dataframe
        dataframe to scale
    scaling : 'maxabs', 'minmax', 'std', or None, optional (default None)
        type of scaling to apply
    """
    if scaling is None or scaling.lower() == 'none':
        return df

    df = df.dropna(axis=1, how='any')

    # Scaling data
    if scaling == 'maxabs':
        # Normalizing -1 to 1
        scaler = MaxAbsScaler()
    elif scaling == 'minmax':
        # Scaling to [0,1]
        scaler = MinMaxScaler()
    else:
        # Standard normalization
        scaler = StandardScaler()

    mat = df.values  # df.as_matrix() was removed in recent pandas versions
    mat = scaler.fit_transform(mat)
    df = pd.DataFrame(mat, columns=df.columns)

    return df
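# A minimal, self-contained sketch (not part of the original helper) contrasting the three
# scaling options the `scale` function above dispatches on, using a small hypothetical
# DataFrame. Only the toy data is invented; the scalers are the same ones used above.
import pandas as pd
from sklearn.preprocessing import MaxAbsScaler, MinMaxScaler, StandardScaler

toy = pd.DataFrame({'a': [-2.0, 0.0, 4.0], 'b': [10.0, 20.0, 30.0]})

# 'maxabs': divide each column by its maximum absolute value -> values in [-1, 1]
print(MaxAbsScaler().fit_transform(toy))
# 'minmax': map each column onto [0, 1]
print(MinMaxScaler().fit_transform(toy))
# 'std': zero mean, unit variance per column
print(StandardScaler().fit_transform(toy))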
def load_data(shuffle=True, n_cols=None):
    train_path = get_p1_file('http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/P1B1/P1B1.train.csv')
    test_path = get_p1_file('http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/P1B1/P1B1.test.csv')

    usecols = list(range(n_cols)) if n_cols else None

    df_train = pd.read_csv(train_path, engine='c', usecols=usecols)
    df_test = pd.read_csv(test_path, engine='c', usecols=usecols)

    df_train = df_train.drop('case_id', axis=1).astype(np.float32)
    df_test = df_test.drop('case_id', axis=1).astype(np.float32)

    if shuffle:
        # `seed` is assumed to be defined at module level in the original benchmark script
        df_train = df_train.sample(frac=1, random_state=seed)
        df_test = df_test.sample(frac=1, random_state=seed)

    X_train = df_train.values  # .as_matrix() was removed in recent pandas versions
    X_test = df_test.values

    # Fit the scaler on train and test jointly, then split the scaled matrix back
    scaler = MaxAbsScaler()
    mat = np.concatenate((X_train, X_test), axis=0)
    mat = scaler.fit_transform(mat)

    X_train = mat[:X_train.shape[0], :]
    X_test = mat[X_train.shape[0]:, :]

    return X_train, X_test
def impute_and_scale(df, scaling='std'): """Impute missing values with mean and scale data included in pandas dataframe. Parameters ---------- df : pandas dataframe dataframe to impute and scale scaling : 'maxabs' [-1,1], 'minmax' [0,1], 'std', or None, optional (default 'std') type of scaling to apply """ df = df.dropna(axis=1, how='all') imputer = Imputer(strategy='mean', axis=0) mat = imputer.fit_transform(df) if scaling is None or scaling.lower() == 'none': return pd.DataFrame(mat, columns=df.columns) if scaling == 'maxabs': scaler = MaxAbsScaler() elif scaling == 'minmax': scaler = MinMaxScaler() else: scaler = StandardScaler() mat = scaler.fit_transform(mat) df = pd.DataFrame(mat, columns=df.columns) return df
def normalize_raw_features(X: np.array) -> np.array: """Normalize features if column was not OneHot encoded""" for col in range(X.shape[1]): dense_col = X[:, col].todense() if (dense_col > 1.).any() or (dense_col < 0.).any(): scaler = MaxAbsScaler().fit(dense_col) X[:, col] = csr_matrix(scaler.transform(dense_col)) return X
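# Side note to the column-wise loop above (not part of the original function): MaxAbsScaler
# also accepts a SciPy sparse matrix directly and returns a sparse result, because it only
# divides each column by its maximum absolute value and never centers the data. A minimal
# sketch on hypothetical data:
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.preprocessing import MaxAbsScaler

X_sparse = csr_matrix(np.array([[0.0, 5.0], [2.0, 0.0], [-4.0, 10.0]]))
X_scaled = MaxAbsScaler().fit_transform(X_sparse)

print(type(X_scaled))       # still a SciPy sparse matrix
print(X_scaled.toarray())   # every column now lies within [-1, 1]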
def cluster_user(self): user_feature_matrix = self.__extract_user_feature() user_feature_matrix = user_feature_matrix.tocsr() user_feature_matrix= MaxAbsScaler().fit_transform(user_feature_matrix) #model = DBSCAN(eps=0.5, min_samples=100).fit(user_feature_matrix) model = MiniBatchKMeans(n_clusters=50,max_iter=10000).fit(user_feature_matrix.toarray()) labels = model.labels_ n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0) print('Estimated number of clusters: %d' % n_clusters_) user_label_dict = dict() for user in self.__user_ix_dict: user_label_dict[user] = labels[self.__user_ix_dict[user]] return user_label_dict
def test_maxabsscaler_vs_sklearn(): # Compare msmbuilder.preprocessing.MaxAbsScaler # with sklearn.preprocessing.MaxAbsScaler maxabsscalerr = MaxAbsScalerR() maxabsscalerr.fit(np.concatenate(trajs)) maxabsscaler = MaxAbsScaler() maxabsscaler.fit(trajs) y_ref1 = maxabsscalerr.transform(trajs[0]) y1 = maxabsscaler.transform(trajs)[0] np.testing.assert_array_almost_equal(y_ref1, y1)
def plotPCA(X_train, y_train, X_test, y_test, outdir): #clf = loadClf(term, fold, clfName) #try: # decision = clf.decision_function # Vf = numpy.arange(-1.,1.1,0.1) # V = (0.,) #except AttributeError: # decision = lambda x:clf.predict_proba(x)[:,0] # Vf = numpy.arange(0.,1.05,0.05) # V = (0.5,) scaler = MaxAbsScaler(copy=False) target_names = ("Positive","Negative") term = outdir.parent.name.replace("_", " ") pca = PCA(n_components=2) pca.fit(X_train) scaler.fit(pca.transform(X_train)) #delta = 0.025 #a=numpy.arange(-1., 1., delta) #b=numpy.arange(-1., 1., delta) #A,B = numpy.meshgrid(a,b) #C=numpy.empty(A.shape) for X, y, n in ((X_train, y_train, 'training'), (X_test, y_test, 'testing')): X_r = scaler.transform(pca.transform(X)) inlier = (numpy.abs(X_r[:,0]) <= 1) & (numpy.abs(X_r[:,1]) <= 1) #print(X_r) plt.clf() #for k,l in product(range(len(a)),range(len(b))): # C[k][l] = decision(pca.inverse_transform(scaler.inverse_transform(((A[k][l],B[k][l]),)))) #print(C) #cfp = plt.contourf(A,B,C,Vf,cmap=plt.cm.bone) #cfp.cmap.set_under('black') #cfp.cmap.set_over('white') #plt.contour(A,B,C,V,colors=("b",)) #y=clf.predict(X) for c, i, target_name in zip("rg", (0, 1), target_names): plt.scatter(X_r[(y == i) & inlier, 0], X_r[(y == i) & inlier, 1], c = c, label = target_name, marker = ",", s = 1,#0.8,#1/numpy.sqrt(2), #edgecolors='none', linewidth = 0, alpha = 0.7) plt.legend() plt.title('PCA for %s on %s data' % (term, n)) plt.savefig(str(outdir/('pca-%s.png' % (n,)))) plt.savefig(str(outdir/('pca-%s.ps' % (n,))))
def _train_test_split(): # Build the store_weather dataframe store_weather_filename = Config.save_dir + "store_weather.pkl" if os.path.exists(store_weather_filename): store_weather = utils.from_pickle(store_weather_filename) else: store_weather = _preprocess_data() # Split train test for each store train = pd.DataFrame({}) test = pd.DataFrame({}) store_ids = store_weather.store_id_bk.unique() for sid in store_ids: c_store = store_weather[store_weather.store_id_bk == sid] s_train = c_store[:-Config.test_size] s_test = c_store[-Config.test_size:] train = train.append(s_train).reset_index().drop(["index"], axis=1) test = test.append(s_test).reset_index().drop(["index"], axis=1) # Scale numeric columns num_cols = ["p_total_revenue", "p_total_volume", "mean_temp", "total_precipitation", "total_snow"] scaler = MaxAbsScaler().fit(train.loc[:, num_cols]) train.loc[:, num_cols] = scaler.transform(train.loc[:, num_cols]) test.loc[:, num_cols] = scaler.transform(test.loc[:, num_cols]) # Scale 2 output columns revenue_scale = MaxAbsScaler().fit(train.loc[:, ["total_revenue"]]) volume_scale = MaxAbsScaler().fit(train.loc[:, ["total_volume"]]) train.loc[:, ["total_revenue"]] = revenue_scale.transform( train.loc[:, ["total_revenue"]]) test.loc[:, ["total_revenue"]] = revenue_scale.transform( test.loc[:, ["total_revenue"]]) train.loc[:, ["total_volume"]] = volume_scale.transform( train.loc[:, ["total_volume"]]) test.loc[:, ["total_volume"]] = volume_scale.transform( test.loc[:, ["total_volume"]]) # Save the train/test dataframes to pickle objects utils.to_pickle(Config.save_dir + "train_set.pkl", train) utils.to_pickle(Config.save_dir + "test_set.pkl", test) # Save the 2 scaler for later use utils.to_pickle(Config.save_dir + "revenue_scale", revenue_scale) utils.to_pickle(Config.save_dir + "volume_scale", volume_scale) # Save store_ids utils.to_pickle(Config.save_dir + "store_id.pkl", store_ids) return train, test
def scale_data(x_train, x_test):
    """
    We only scale the continuous features. No need to scale binary features
    """
    idx_binary = []  # columns with boolean values
    for k in range(x_train.shape[1]):
        # checking if a column is binary (all values equal their bool cast, i.e. 0/1)
        idx_binary.append(np.array_equal(x_train[:, k], x_train[:, k].astype(bool)))
    idx_cont = np.logical_not(idx_binary)

    sc = MaxAbsScaler()
    sc.fit(x_train[:, idx_cont])
    x_train[:, idx_cont] = sc.transform(x_train[:, idx_cont])
    x_test[:, idx_cont] = sc.transform(x_test[:, idx_cont])
    # the arrays are modified in place; return them as well for convenience
    return x_train, x_test
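# Hypothetical usage of scale_data above on small synthetic arrays (assumptions: the
# function and MaxAbsScaler are in scope, and the arrays are writable float NumPy arrays,
# as the in-place assignment requires). The second column is binary and stays untouched.
import numpy as np

x_train = np.array([[10.0, 1.0], [-20.0, 0.0], [5.0, 1.0]])
x_test = np.array([[40.0, 0.0], [-10.0, 1.0]])

scale_data(x_train, x_test)   # continuous column scaled in place; binary column untouched
print(x_train)  # first column divided by 20, its max absolute value
print(x_test)   # test values may fall outside [-1, 1]: 40 / 20 = 2.0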
def impute_and_scale(df, scaling=None): """Impute missing values with mean and scale data included in pandas dataframe. Parameters ---------- df : pandas dataframe dataframe to impute and scale scaling : 'maxabs' [-1,1], 'minmax' [0,1], 'std', or None, optional (default 'std') type of scaling to apply """ df = df.dropna(axis=1, how='all') imputer = Imputer(strategy='mean', axis=0) mat = imputer.fit_transform(df) # print(mat.shape) if scaling is None: return pd.DataFrame(mat, columns=df.columns) # Scaling data if scaling == 'maxabs': # Normalizing -1 to 1 scaler = MaxAbsScaler() elif scaling == 'minmax': # Scaling to [0,1] scaler = MinMaxScaler() else: # Standard normalization scaler = StandardScaler() mat = scaler.fit_transform(mat) # print(mat.shape) df = pd.DataFrame(mat, columns=df.columns) return df
import numpy as np  # needed for np.float64 below
import pandas as pd
from sklearn.model_selection import train_test_split  # sklearn.cross_validation was removed
from sklearn.preprocessing import MaxAbsScaler
from sklearn.neighbors import KNeighborsClassifier

# NOTE: Make sure that the class is labeled 'class' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR')
training_indices, testing_indices = train_test_split(tpot_data.index,
                                                     stratify=tpot_data['class'].values,
                                                     train_size=0.75, test_size=0.25)

result1 = tpot_data.copy()

# Use Scikit-learn's MaxAbsScaler to scale the features
training_features = result1.loc[training_indices].drop('class', axis=1)
if len(training_features.columns.values) > 0:
    scaler = MaxAbsScaler()
    scaler.fit(training_features.values.astype(np.float64))
    scaled_features = scaler.transform(result1.drop('class', axis=1).values.astype(np.float64))
    result1 = pd.DataFrame(data=scaled_features)
    # restore the label column from the original data (result1 was just rebuilt without it)
    result1['class'] = tpot_data['class'].values
else:
    result1 = result1.copy()

# Perform classification with a k-nearest neighbor classifier
knnc2 = KNeighborsClassifier(n_neighbors=min(10, len(training_indices)))
knnc2.fit(result1.loc[training_indices].drop('class', axis=1).values,
          result1.loc[training_indices, 'class'].values)
result2 = result1.copy()
result2['knnc2-classification'] = knnc2.predict(result2.drop('class', axis=1).values)
# Load dataset: path = '/media/DATA/tmp/datasets/regionais/meteo_regions/csv_regions/TAG/yearly/' file = 'yearly_clip_R1_OK_TAG.csv' df = pd.read_csv(os.path.join(path, file), sep=',', decimal='.') # Split into input (X) and output (Y) variables: df2 = df[['36V', '89V', '166V', '190V']] #x = df2.reindex(columns=cols) x = df2[['36V', '89V', '166V', '190V']] y = df[['TagRain']] # Scaling the input paramaters: scaler_min_max = MinMaxScaler() x_minmax = scaler_min_max.fit_transform(x) scaler_abs_max = MaxAbsScaler() x_abs_max = scaler_abs_max.fit_transform(x) stand_sc = StandardScaler() x_stand_sc = stand_sc.fit_transform(x) norm_sc = Normalizer() x_norm = norm_sc.fit_transform(x) x_power_box = PowerTransformer(method='box-cox').fit_transform(x) x_power_yeo = PowerTransformer(method='yeo-johnson').fit_transform(x) x_quantil = QuantileTransformer(output_distribution = 'uniform').fit_transform(x)
# ; 'P17','P18','P19','P20','P21','P22','P23','P24', # ; 'P25','P26','P27','P28','P29','P30','P31','P32', # ; 'P33','P34','P35','P36','P37','P38','P39','P40', # ; 'P41','P42','P43','P44','P45','P46','P47','P48', # ; 'P49','P50','P51','P52','P53','P54','P55','P56', # ; 'P57','P58'], header=None) print(df_train.head()) df_train = pd.get_dummies(df_train) df_train = df_train.fillna(df_train.mean()) y = df_train['P29'].values X_train = df_train[['P0','P1','P2','P3','P4','P5','P6','P7','P8','P9','P10','P11','P12','P13',\ 'P14','P15','P16','P17','P18','P19','P20','P21','P22','P23','P24','P25','P26','P27','P28']] # y = df_train['P58'].values # X_train = df_train[['P0','P1','P2','P3','P4','P5','P6','P7','P8','P9','P10','P11','P12','P13',\ # 'P14','P15','P16','P17','P18','P19','P20','P21','P22','P23','P24','P25','P26', # 'P27','P28','P29','P30','P31','P32','P33','P34','P35','P36','P37','P38','P39', # 'P40','P41','P42','P43','P44','P45','P46','P47','P48','P49','P50','P51','P52', # 'P53','P54','P55','P56','P57']] X_train = MaxAbsScaler().fit_transform(X_train) seed = 30 np.random.seed(seed) X_train, X_test, y_train, y_test = train_test_split(X_train, y, test_size=0.064, random_state=seed) model = load_network(filename) evaluated(model,X_test, y_test)
def masked_randomForest(X_train, y_train, preprocess='Std'): # PREPROCESSING = FEATURES SCALING if preprocess == 'MaxMin': preprocessing = MaxAbsScaler() preprocessing.fit(X_train) X_train = preprocessing.transform(X_train) print 'preprocess %s completed' % (preprocess) if preprocess == 'Binarization': preprocessing = Binarizer() preprocessing.fit(X_train) X_train = preprocessing.transform(X_train) print 'preprocess %s completed' % (preprocess) if preprocess == 'Std': preprocessing = StandardScaler(with_mean=False) preprocessing.fit(X_train) X_train = preprocessing.transform(X_train) print 'preprocess %s completed' % (preprocess) if preprocess == 'full_std': preprocessing = StandardScaler() X_train = preprocessing.fit_transform(X_train.toarray()) print 'preprocess %s completed' % (preprocess) if preprocess == 'norm': X_train = normalize(X_train.toarray(), axis=0, norm='l1') print 'preprocess %s completed' % (preprocess) clf = RandomForestClassifier(n_jobs=-1, n_estimators=50) clf.fit(X_train, y_train) importances = clf.feature_importances_ inter = [np.percentile(importances, qq) for qq in np.linspace(0, 100, 6)] count = 0 RF_features = np.zeros((1, len(importances))) for low, high in zip(inter[:len(inter)], inter[1:]): if low == -1: RF_features += count * np.logical_and(importances <= high, importances >= low) else: RF_features += count * np.logical_and(importances <= high, importances > low) count += 1 RF_features = RF_features.astype(int) importances.sort() fig, ax1 = plt.subplots(1) x = np.arange(len(importances)) ax1.plot(x, importances[::-1], 'b-') ax1.set_xlabel('features') # Make the y-axis label, ticks and tick labels match the line color. ax1.set_ylabel('Random Forest importance', color='b') ax1.tick_params('y', colors='b') ax1.set_yscale('log') fig.tight_layout() plt.title(' RandomForest importance ') plt.show() return RF_features
class Simulation(object): """Class glueing all the pieces together. Performs whole simulation. :param dataset: Dataset which extends :py:obj:`mutabledataset.SimMixin` :param AgentCl: Class defining agent behavior, namely `benefit` and `cost`. :param learner: Class defining learner behavior, namely `fit` and `predict`. :param split: Defines portion used for fitting the learner. Rest is used for determining `eps` value, regarding the epsilon equilibrium. Simulation is done on the whole dataset. :param cost_distribution: Passed on to AgentTransformer. :param cost_distribution_dep: Passed on to AgentTransformer. :param no_neighbors: Passed on to AgentTransformer. :param max_it: Passed on to AgentTransformer. :param collect_incentive_data: Passed on to AgentTransformer. """ def __init__(self, dataset, AgentCl, learner, cost_distribution, split=[0.5], collect_incentive_data=False, no_neighbors=60, cost_distribution_dep=None, max_it=130): self.dataset = dataset self.no_neighbors = no_neighbors self.cost_distribution = cost_distribution self.max_it = max_it self.learner = learner self.split = split self.AgentCl = AgentCl self.collect_incentive_data = collect_incentive_data self.cost_distribution_dep = cost_distribution_dep def no_classes(self, dataset): """ :param dataset: Some AIF360 dataset :returns: Number of distinct labels (classes) """ return len(set(dataset.labels.ravel())) def start_simulation(self, runs=1, scale=True): """ :param runs: Run simulation multiple times with the same parameters :param scale: Perform scaling on dataset features. :returns: Modified dataset including new ground truth labels :rtype: :py:obj:`simulation.SimulationResultSet` """ res_list = [] for i in range(runs): res_list.append(self._simulate(scale)) return SimulationResultSet(res_list, runs=runs) def _simulate(self, scale): """ Private entrypoint to perform a single simulation :param scale: Perform scaling on dataset features :returns: Modified dataset including new ground truth labels :rtype: :py:obj:`simulation.SimulationResult` """ self.scaler = MaxAbsScaler() dataset = self.dataset.copy(deepcopy=True) # we need at least one example for each class in each of the two splits while True: train, test = dataset.split(self.split, shuffle=False) break if self.no_classes(train) >= 2 and self.no_classes(test) >= 2: break train_indices = list(map(int, train.instance_names)) test_indices = list(map(int, test.instance_names)) self.train, self.test = train, test if scale: train.features = self.scaler.fit_transform(train.features) test.features = self.scaler.transform(test.features) dataset.features = self.scaler.transform(dataset.features) dataset.infer_domain() # learner moves self.learner.fit(train) ft_names = dataset.protected_attribute_names ft_indices = list( map(lambda x: not x in ft_names, dataset.feature_names)) self.Y_predicted = self.learner.predict(dataset.features) self.Y_predicted_pr = self.learner.predict_proba(dataset.features) # agents move at = AgentTransformer( self.AgentCl, self.learner, self.cost_distribution, collect_incentive_data=self.collect_incentive_data, no_neighbors=self.no_neighbors, cost_distribution_dep=self.cost_distribution_dep, max_it=self.max_it) dataset_ = at.transform(dataset) train_ = utils.dataset_from_matrix( np.hstack((dataset_.features[train_indices, :], dataset_.labels[train_indices])), dataset) test_ = utils.dataset_from_matrix( np.hstack((dataset_.features[test_indices, :], dataset_.labels[test_indices])), dataset) acc_h = self.learner.accuracy(test) # update changed features 
#dataset_ = dataset_from_matrix(np.hstack((np.vstack((train_.features, test_.features)), np.vstack((train_.labels, test_.labels)))), dataset) self.Y_new_predicted = self.learner.predict(dataset_.features) self.Y_new_predicted_pr = self.learner.predict_proba(dataset_.features) acc_h_post = self.learner.accuracy(test_) # fit data again, see if accuracy changes self.learner.fit(train_) acc_h_star_post = self.learner.accuracy(test_) # construct datasets for features # including predicted label if scale: dataset.features = self.scaler.inverse_transform(dataset.features) dataset_df = dataset.convert_to_dataframe(de_dummy_code=True)[0] dataset_df['credit_h'] = pd.Series(self.Y_predicted, index=dataset_df.index) dataset_df['credit_h_pr'] = pd.Series(self.Y_predicted_pr, index=dataset_df.index) if scale: dataset_.features = self.scaler.inverse_transform( dataset_.features) dataset_new_df = dataset_.convert_to_dataframe(de_dummy_code=True)[0] dataset_new_df['credit_h'] = pd.Series(self.Y_new_predicted, index=dataset_new_df.index) dataset_new_df['credit_h_pr'] = pd.Series(self.Y_new_predicted_pr, index=dataset_new_df.index) res = SimulationResult() res.df = dataset_df res.df_new = dataset_new_df res.eps = abs(acc_h_star_post - acc_h_post) res.acc_h = acc_h res.acc_h_post = acc_h_post res.acc_h_star_post = acc_h_star_post res.incentives = at.incentives return res
import numpy as np
import scipy.sparse as sp
from keras import backend as K
from sklearn.metrics import roc_auc_score as auc_score
from sklearn.metrics import average_precision_score as ap_score
from sklearn.preprocessing import MinMaxScaler, MaxAbsScaler, StandardScaler

from utils import generate_data, batch_data, compute_masked_accuracy
from utils_gcn import load_citation_data, split_citation_data
from ae_LPNC import autoencoder_multitask

# Use the 'citeseer' dataset as an example to walk through the multi-task setup
dataset = 'citeseer'
print('\nLoading dataset {:s}...\n'.format(dataset))
adj, feats, y_train, y_val, y_test, mask_train, mask_val, mask_test = load_citation_data(
    dataset)
feats = MaxAbsScaler().fit_transform(feats).tolil()
train = adj.copy()

test_inds = split_citation_data(adj)
test_inds = np.vstack({tuple(row) for row in test_inds})
test_r = test_inds[:, 0]
test_c = test_inds[:, 1]
labels = []
labels.extend(np.squeeze(adj[test_r, test_c].toarray()))
labels.extend(np.squeeze(adj[test_c, test_r].toarray()))

multitask = True
if multitask:
    # If multitask, simultaneously perform link prediction and
    # semi-supervised node classification on incomplete graph with
    # 10% held-out positive links and same number of negative links.
def getviz_kpca(X, y, figPath, fig_prefix='KPCA_viz'): # this is a non optimized visualization ! Just for thoughts preprocessing = MaxAbsScaler() X_train = preprocessing.fit_transform(X) print 'preprocessing MaxAbs done' os.chdir(figPath) reds = y == 0 blues = y == 1 kernels = ['cosine', 'rbf', 'regular'] gammas = [1e-4, 1e-3, 1e-2] for k in kernels: if k == 'rbf': for g in gammas: plt.figure() kpca = KernelPCA(kernel=k, gamma=g, n_components=2, n_jobs=-1) X_kpca = kpca.fit_transform(X_train) plt.plot(X_kpca[reds, 0], X_kpca[reds, 1], "ro", label='csp-') plt.plot(X_kpca[blues, 0], X_kpca[blues, 1], "bo", label='csp+') plt.title("Projection by PCA with %s kernel, gamma = %f" % (k, g)) plt.xlabel("1st principal component") plt.ylabel("2nd component") plt.legend(loc="lower right", prop={'size': 6}) plt.savefig('img/' + fig_prefix + k + 'gamma_' + str(g)) print 'rbf PCA done' elif k == 'regular': plt.figure() kpca = PCA() X_kpca = kpca.fit_transform(X_train) plt.plot(X_kpca[reds, 0], X_kpca[reds, 1], "ro", label='csp-') plt.plot(X_kpca[blues, 0], X_kpca[blues, 1], "bo", label='csp+') plt.title("Projection by PCA") plt.xlabel("1st principal component") plt.ylabel("2nd component") plt.legend(loc="lower right", prop={'size': 6}) plt.savefig('img/' + fig_prefix + k) plt.figure() plt.plot(kpca.explained_variance_, linewidth=2) plt.xlabel('n_components') plt.ylabel('explained_variance_') plt.title("Projection by PCA") plt.savefig('img/' + fig_prefix + k + 'explained_variance') print 'PCA done' elif k == 'cosine': plt.figure() kpca = KernelPCA(kernel=k, n_components=2, n_jobs=-1) X_kpca = kpca.fit_transform(X_train) plt.plot(X_kpca[reds, 0], X_kpca[reds, 1], "ro", label='csp-') plt.plot(X_kpca[blues, 0], X_kpca[blues, 1], "bo", label='csp+') plt.title("Projection by PCA with %s kernel" % (k)) plt.xlabel("1st principal component") plt.ylabel("2nd component") plt.legend(loc="lower right", prop={'size': 6}) plt.savefig('img/' + fig_prefix + k) print 'consine PCA done'
def find_best_solution(train_filename): unscaled_dist = extract_dist(train_filename) # unscaled_dist.info() unscaled_dist = clean_distribution(unscaled_dist) # unscaled_dist.info() # standardised_csv.to_csv('scaledData.csv', index=False) preprocessors = { 'StandardScaler': StandardScaler(), # The below remove anomalies 'RobustScaler': RobustScaler(), 'PowerTransformer(method="yeo-johnson")': PowerTransformer(method='yeo-johnson'), 'QuantileTransformer(output_distribution="normal")': QuantileTransformer(output_distribution="normal"), 'QuantileTransformer(output_distribution="uniform")': QuantileTransformer(output_distribution="uniform"), 'MinMaxScaler': MinMaxScaler(), 'MaxAbsScaler': MaxAbsScaler(), 'Normalizer': Normalizer(), } distributions = { 'unscaled': unscaled_dist, 'StandardScaler': scale_distribution(unscaled_dist, StandardScaler()), 'RobustScaler': scale_distribution(unscaled_dist, RobustScaler()), 'PowerTransformer(method="yeo-johnson")': scale_distribution(unscaled_dist, PowerTransformer(method='yeo-johnson')), 'QuantileTransformer(output_distribution="normal")': scale_distribution(unscaled_dist, QuantileTransformer(output_distribution="normal")), 'QuantileTransformer(output_distribution="uniform")': scale_distribution(unscaled_dist, QuantileTransformer(output_distribution="uniform")), 'MinMaxScaler': scale_distribution(unscaled_dist, MinMaxScaler()), 'MaxAbsScaler': scale_distribution(unscaled_dist, MaxAbsScaler()), 'Normalizer': scale_distribution(unscaled_dist, Normalizer()), } classifiers = { # Standard classifiers 'KNeighborsClassifier': { 'configurable': lambda configuration: KNeighborsClassifier(n_neighbors= configuration), 'score_function': get_classifier_score_knn, }, 'LinearSVC': { 'configurable': lambda configuration: LinearSVC(max_iter=configuration), 'score_function': get_classifier_score_linear_svc, }, 'LogisticRegression': { 'configurable': lambda configuration: LogisticRegression(solver=configuration), 'score_function': get_classifier_score_logistic_regression, }, # DL # 'Sequential': { # # 'configurable': # TODO get this working # 'score_function': get_model_score_sequential, # }, } test_sizes = [0.2, 0.25, 0.3] # test_sizes = [0.2] results_of_all_configurations = [] log(blue(f'\n############# Trying Configurations #############')) for dist_name, dist in distributions.items(): log(bold(f'\n{dist_name}')) X, y = get_x_matrix_and_y_vector( dist ) # set up X matrix and y vectors for the test and training sets for test_size in test_sizes: log(f' test_size = {test_size}') train_test_data = (X, y, test_size) for classifier_name, classifier in classifiers.items(): log(f' {classifier_name}') configuration_results, classifier_average_score = try_configuration( train_test_data, classifier) for result in configuration_results: result['preprocessor_name'] = dist_name result['test_size'] = test_size result['classifier_name'] = classifier_name results_of_all_configurations.extend(configuration_results) results_of_all_configurations.sort( key=lambda result: result['configuration_score'], reverse=True, ) logTopConfigurations(results_of_all_configurations, 10) return extract_best_solution(preprocessors, classifiers, results_of_all_configurations)
class ParallelCoordinates(DataVisualizer): """ Parallel coordinates displays each feature as a vertical axis spaced evenly along the horizontal, and each instance as a line drawn between each individual axis. Parameters ---------- ax : matplotlib Axes, default: None The axis to plot the figure on. If None is passed in the current axes will be used (or generated if required). features : list, default: None a list of feature names to use If a DataFrame is passed to fit and features is None, feature names are selected as the columns of the DataFrame. classes : list, default: None a list of class names for the legend If classes is None and a y value is passed to fit then the classes are selected from the target vector. normalize : string or None, default: None specifies which normalization method to use, if any Current supported options are 'minmax', 'maxabs', 'standard', 'l1', and 'l2'. sample : float or int, default: 1.0 specifies how many examples to display from the data If int, specifies the maximum number of samples to display. If float, specifies a fraction between 0 and 1 to display. color : list or tuple, default: None optional list or tuple of colors to colorize lines Use either color to colorize the lines on a per class basis or colormap to color them on a continuous scale. colormap : string or cmap, default: None optional string or matplotlib cmap to colorize lines Use either color to colorize the lines on a per class basis or colormap to color them on a continuous scale. vlines : boolean, default: True flag to determine vertical line display vlines_kwds : dict, default: None options to style or display the vertical lines, default: None kwargs : dict Keyword arguments that are passed to the base class and may influence the visualization as defined in other Visualizers. Examples -------- >>> visualizer = ParallelCoordinates() >>> visualizer.fit(X, y) >>> visualizer.transform(X) >>> visualizer.poof() Notes ----- These parameters can be influenced later on in the visualization process, but can and should be set as early as possible. """ normalizers = { 'minmax': MinMaxScaler(), 'maxabs': MaxAbsScaler(), 'standard': StandardScaler(), 'l1': Normalizer('l1'), 'l2': Normalizer('l2'), } def __init__(self, ax=None, features=None, classes=None, normalize=None, sample=1.0, color=None, colormap=None, vlines=True, vlines_kwds=None, **kwargs): super(ParallelCoordinates, self).__init__( ax, features, classes, color, colormap, **kwargs ) # Validate 'normalize' argument if normalize in self.normalizers or normalize is None: self.normalize = normalize else: raise YellowbrickValueError( "'{}' is an unrecognized normalization method" .format(normalize) ) # Validate 'sample' argument if isinstance(sample, int): if sample < 1: raise YellowbrickValueError( "`sample` parameter of type `int` must be greater than 1" ) elif isinstance(sample, float): if sample <= 0 or sample > 1: raise YellowbrickValueError( "`sample` parameter of type `float` must be between 0 and 1" ) else: raise YellowbrickTypeError( "`sample` parameter must be int or float" ) self.sample = sample # Visual Parameters self.show_vlines = vlines self.vlines_kwds = vlines_kwds or { 'linewidth': 1, 'color': 'black' } def draw(self, X, y, **kwargs): """ Called from the fit method, this method creates the parallel coordinates canvas and draws each instance and vertical lines on it. 
""" # Convert from dataframe if is_dataframe(X): X = X.as_matrix() # Choose a subset of samples # TODO: allow selection of a random subset of samples instead of head if isinstance(self.sample, int): self.n_samples = min([self.sample, len(X)]) elif isinstance(self.sample, float): self.n_samples = int(len(X) * self.sample) X = X[:self.n_samples, :] # Normalize if self.normalize is not None: X = self.normalizers[self.normalize].fit_transform(X) # Get the shape of the data nrows, ncols = X.shape # Create the xticks for each column # TODO: Allow the user to specify this feature x = list(range(ncols)) # Create the colors # TODO: Allow both colormap, listed colors, and palette definition # TODO: Make this an independent function or property for override! color_values = resolve_colors( n_colors=len(self.classes_), colormap=self.colormap, colors=self.color ) colors = dict(zip(self.classes_, color_values)) # Track which labels are already in the legend used_legends = set([]) # TODO: Make this function compatible with DataFrames! # TODO: Make an independent function to allow addition of instances! for idx, row in enumerate(X): # TODO: How to map classmap to labels? label = y[idx] # Get the label for the row label = self.classes_[label] if label not in used_legends: used_legends.add(label) self.ax.plot(x, row, color=colors[label], alpha=0.25, label=label, **kwargs) else: self.ax.plot(x, row, color=colors[label], alpha=0.25, **kwargs) # Add the vertical lines # TODO: Make an independent function for override! if self.show_vlines: for idx in x: self.ax.axvline(idx, **self.vlines_kwds) # Set the limits self.ax.set_xticks(x) self.ax.set_xticklabels(self.features_) self.ax.set_xlim(x[0], x[-1]) def finalize(self, **kwargs): """ Finalize executes any subclass-specific axes finalization steps. The user calls poof and poof calls finalize. Parameters ---------- kwargs: generic keyword arguments. """ # Set the title self.set_title( 'Parallel Coordinates for {} Features'.format(len(self.features_)) ) # Set the legend and the grid self.ax.legend(loc='best') self.ax.grid()
article = df.loc['Cristiano Ronaldo'] # Compute the dot products: similarities similarities = df.dot(article) # Display those with the largest cosine similarity print(similarities.nlargest()) # ===================== # # ==== Ejercicio 30 === # # ===================== # # Perform the necessary imports from sklearn.decomposition import NMF from sklearn.preprocessing import Normalizer, MaxAbsScaler from sklearn.pipeline import make_pipeline # Create a MaxAbsScaler: scaler scaler = MaxAbsScaler() # Create an NMF model: nmf nmf = NMF(n_components = 20) # Create a Normalizer: normalizer normalizer = Normalizer() # Create a pipeline: pipeline pipeline = make_pipeline(scaler, nmf, normalizer) # Apply fit_transform to artists: norm_features norm_features = pipeline.fit_transform(artists) # ===================== # # ==== Ejercicio 31 === # # ===================== # # Import pandas import pandas as pd
#Import Libraries from sklearn.datasets import make_regression from sklearn.preprocessing import MaxAbsScaler #---------------------------------------------------- ''' work on columns class sklearn.preprocessing.MaxAbsScaler(copy=True) ''' # ---------------------------------------------------- # MaxAbsScaler Data X, y = make_regression(n_samples=500, n_features=3, shuffle=True) X = X * 100 # showing data print('X \n', X[:5]) print('y \n', y[:5]) scaler = MaxAbsScaler(copy=True) X = scaler.fit_transform(X) #showing data print('X \n', X[:5]) print('y \n', y[:5])
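# Follow-up sketch (not from the original snippet): verifying by hand that MaxAbsScaler
# works per column, dividing every value by that column's maximum absolute value. Nothing
# is shifted, so zeros and signs are preserved. The small array below is invented.
import numpy as np
from sklearn.preprocessing import MaxAbsScaler

A = np.array([[1.0, -10.0],
              [2.0, 40.0],
              [-4.0, 20.0]])

scaled = MaxAbsScaler().fit_transform(A)
manual = A / np.abs(A).max(axis=0)   # divide each column by its max |value|

print(np.allclose(scaled, manual))   # True
print(scaled)                        # every column now lies within [-1, 1]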
# %% [code] train_data.info() # %% [code] train_data.hist(bins=10) # %% [code] from sklearn.preprocessing import MaxAbsScaler, RobustScaler, MinMaxScaler cols = ["Fare"] X = train_data.drop(["Survived"], axis=1) fare_scaled_r = RobustScaler().fit(X[cols]) X[cols] = fare_scaled_r.transform(X[cols]) fare_scaled_ma = MaxAbsScaler().fit(X[cols]) #X[cols] = fare_scaled_ma.transform(X[cols]) # %% [code] X.head() # %% [code] y = train_data["Survived"] # %% [code] X.shape, y.shape # %% [code] from sklearn.model_selection import train_test_split X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.85)
weather[weather["maxWd"].isnull() == True] weather["maxWd"].fillna(method='bfill', limit=1, inplace=True) std_weather = pd.DataFrame(StandardScaler().fit_transform( weather.drop(columns="tm")), columns=weather.drop(columns="tm").columns) std_weather["tm"] = weather["tm"] mm_weather = pd.DataFrame(MinMaxScaler().fit_transform( weather.drop(columns="tm")), columns=weather.drop(columns="tm").columns) mm_weather["tm"] = weather["tm"] ma_weather = pd.DataFrame(MaxAbsScaler().fit_transform( weather.drop(columns="tm")), columns=weather.drop(columns="tm").columns) ma_weather["tm"] = weather["tm"] rb_weather = pd.DataFrame(RobustScaler().fit_transform( weather.drop(columns="tm")), columns=weather.drop(columns="tm").columns) rb_weather["tm"] = weather["tm"] feature_name = [ "avgTa", "sumRn", "avgWs", "maxWd", "avgTd", "minRhm", "sumGsr", "avgTs" ] def makeDecisionTree(citerion, x_train, y_train, depthNum): if depthNum > 0:
from sklearn.cluster import FeatureAgglomeration from sklearn.feature_selection import SelectFwe, SelectKBest, SelectPercentile, VarianceThreshold from sklearn.feature_selection import SelectFromModel, RFE from sklearn.ensemble import ExtraTreesClassifier from sklearn.tree import DecisionTreeClassifier from sklearn.model_selection import cross_val_predict from sklearn.metrics import accuracy_score, f1_score from tpot_metrics import balanced_accuracy_score from sklearn.pipeline import make_pipeline dataset = sys.argv[1] preprocessor_list = [ Binarizer(), MaxAbsScaler(), MinMaxScaler(), Normalizer(), PolynomialFeatures(), RobustScaler(), StandardScaler(), FastICA(), PCA(), RBFSampler(), Nystroem(), FeatureAgglomeration(), SelectFwe(), SelectKBest(), SelectPercentile(), VarianceThreshold(), SelectFromModel(estimator=ExtraTreesClassifier(n_estimators=100)),
def hyperParamSearch(X_train, y_train, X_test, y_test, clf="logistic", scoring='accuracy', preprocess='MaxMin'): tuned_parameters = dict() if preprocess == 'MaxMin': preprocessing = ('MaxMin', MaxAbsScaler()) if preprocess == 'Binarization': preprocessing = ('Bin', Binarizer()) if clf == "logistic": #Parameters of pipelines can be set using ‘__’ separated parameter names: tuned_parameters = [{ 'logistic__penalty': ['l1', 'l2'], 'logistic__C': [0.000001, 0.00001, 0.0001, 0.005, 0.001, 0.05, 0.01], 'logistic__class_weight': [None, 'balanced'] }] pipe = Pipeline( steps=[preprocessing, ('logistic', LogisticRegression(n_jobs=-1))]) if clf == "randomForest": tuned_parameters = [{ 'randomForest__n_estimators': [100, 300, 500], 'randomForest__min_samples_leaf': [1, 2, 5, 10, 25, 50], 'randomForest__class_weight': [None, 'balanced'] }] pipe = Pipeline(steps=[ preprocessing, ('randomForest', RandomForestClassifier(n_jobs=-1)) ]) if clf == "KNN": tuned_parameters = [{ 'KNN__n_neighbors': [5, 10, 20, 40], 'KNN__weights': ['distance', 'uniform'], 'KNN__metric': ['euclidean', 'manhattan'] }] pipe = Pipeline( steps=[preprocessing, ('KNN', KNeighborsClassifier(n_jobs=-1))]) for score in scoring: estimator = GridSearchCV(pipe, tuned_parameters, cv=3, scoring=score, error_score=-1, n_jobs=-1) estimator.fit(X_train, y_train) save_name = "final_%s(%s based_%s preprocessed).pkl" % (clf, score, preprocess) # print information print("INFO: %s model (preprocessed by %s crossvalid based on %s)" % (clf, preprocess, score)) print("Best parameters set found on development set:") print(estimator.best_params_) print("%s scores on development set:" % (score)) means = estimator.cv_results_['mean_test_score'] stds = estimator.cv_results_['std_test_score'] for mean, std, params in zip(means, stds, estimator.cv_results_['params']): print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params)) print("Detailed classification report:") print() print("The model is trained on the full development set.") print("The scores are computed on the full evaluation set.") print() y_true, y_pred = y_test, estimator.predict(X_test) print(classification_report(y_true, y_pred)) print() joblib.dump(estimator, save_name, compress=True)
def hyperParamSearch_SMOTE(X_train, y_train, X_test, y_test, clf="logistic", scoring='accuracy', preprocess='MaxMin', method='agglo_custom'): sm = SMOTE(random_state=1, n_jobs=-1) X, y_train = sm.fit_sample(X_train.toarray(), y_train) X_train = csr_matrix(X) tuned_parameters = dict() if preprocess == 'MaxMin': preprocessing = ('MaxMin', MaxAbsScaler()) if preprocess == 'Binarization': preprocessing = ('Bin', Binarizer()) if clf == "logistic": # Parameters of pipelines can be set using ‘__’ separated parameter names: tuned_parameters = [{ 'logistic__penalty': ['l1', 'l2'], 'logistic__C': [0.0001, 0.001, 0.1, 1, 10], 'logistic__class_weight': [None, 'balanced'] }] if method == 'agglo_custom': tuned_parameters[0]['featuresaggregationscore__clusters'] = [ 50, 100, 200, 500 ] pipe = Pipeline(steps=[ preprocessing, ('featuresaggregationscore', FeaturesAggregationScore() ), ('logistic', LogisticRegression(n_jobs=-1)) ]) elif method == 'reduce_dim': tuned_parameters[0]['kernelpca__n_components'] = [ 50, 100, 200, 500 ] pipe = Pipeline(steps=[ preprocessing, ('kernelpca', KernelPCA(kernel='cosine', n_jobs=-1) ), ('logistic', LogisticRegression(n_jobs=-1)) ]) elif method == 'feat_select': fselect = SelectPercentile(chi2) tuned_parameters[0]['fselect__percentile'] = [20, 40, 60, 80] pipe = Pipeline(steps=[ preprocessing, ( 'fselect', fselect), ('logistic', LogisticRegression(n_jobs=-1)) ]) if clf == "randomForest": tuned_parameters = [{ 'randomForest__n_estimators': [100, 500], 'randomForest__min_samples_leaf': [1, 10, 25], 'randomForest__class_weight': [None, 'balanced'] }] if method == 'agglo_custom': tuned_parameters[0]['featuresaggregationscore__clusters'] = [ 50, 100, 200, 500 ] pipe = Pipeline(steps=[ preprocessing, ('featuresaggregationscore', FeaturesAggregationScore() ), ('randomForest', RandomForestClassifier(n_jobs=-1)) ]) elif method == 'reduce_dim': tuned_parameters[0]['kernelpca__n_components'] = [ 50, 100, 200, 500 ] pipe = Pipeline(steps=[ preprocessing, ('kernelpca', KernelPCA(kernel='cosine', n_jobs=-1) ), ('randomForest', RandomForestClassifier(n_jobs=-1)) ]) elif method == 'feat_select': fselect = SelectPercentile(chi2) tuned_parameters[0]['fselect__percentile'] = [20, 40, 60, 80] pipe = Pipeline(steps=[ preprocessing, ('fselect', fselect), ('randomForest', RandomForestClassifier(n_jobs=-1)) ]) for score in scoring: estimator = GridSearchCV(pipe, tuned_parameters, cv=3, scoring=score, error_score=-1, n_jobs=-1) estimator.fit(X_train, y_train) save_name = "final_%s(%s based_%s preprocessed).pkl" % (clf, score, preprocess) # print information print("INFO: %s model (preprocessed by %s crossvalid based on %s)" % (clf, preprocess, score)) print("Best parameters set found on development set:") print(estimator.best_params_) print("%s scores on development set:" % (score)) means = estimator.cv_results_['mean_test_score'] stds = estimator.cv_results_['std_test_score'] for mean, std, params in zip(means, stds, estimator.cv_results_['params']): print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params)) print("Detailed classification report:") print() print("The model is trained on the full development set.") print("The scores are computed on the full evaluation set.") print() y_true, y_pred = y_test, estimator.predict(X_test) print(classification_report(y_true, y_pred)) print() joblib.dump(estimator, save_name, compress=True)
    input_scaler.fit(X)
    # transform training dataset
    X = input_scaler.transform(X)
    if output_scaler is not None:
        # reshape 1d arrays to 2d arrays
        y = y  # .reshape(len(y), 1)
        # fit scaler on training dataset
        output_scaler.fit(y)
        # transform training dataset
        y = output_scaler.transform(y)
    return X, y


ss = StandardScaler()
mm = MinMaxScaler()
mas = MaxAbsScaler()
rs = RobustScaler(quantile_range=(25, 75))
pt1 = PowerTransformer(method="yeo-johnson")
pt2 = PowerTransformer(method="box-cox")  # was assigned to pt1 twice, leaving pt2 undefined below
qt1 = QuantileTransformer(output_distribution="uniform")
qt2 = QuantileTransformer(output_distribution="normal")
n = Normalizer()

X_none, y_none = get_dataset(None, None)
X_ss, y_ss = get_dataset(ss, ss)
X_mm, y_mm = get_dataset(mm, mm)
X_mas, y_mas = get_dataset(mas, mas)
X_rs, y_rs = get_dataset(rs, rs)
X_pt1, y_pt1 = get_dataset(pt1, pt1)
X_pt2, y_pt2 = get_dataset(pt2, pt2)
X_qt1, y_qt1 = get_dataset(qt1, qt1)
# (9584, 22144)
# ------------------------------------------------------------------------
# Normalize
# The generator's final activation is tanh.
# Because the tanh output lies in [-1, 1], the data must be scaled to max 1 / min -1 as well.
print(np.max(f_ds), np.min(f_ds))
# 3.8146973e-06 -80.0

from sklearn.preprocessing import MaxAbsScaler, MinMaxScaler, StandardScaler, RobustScaler

scaler1 = StandardScaler()
scaler1.fit(f_ds)
f_ds = scaler1.transform(f_ds)

scaler2 = MaxAbsScaler()
scaler2.fit(f_ds)
f_ds = scaler2.transform(f_ds)

# Check that the values now lie between -1 and 1
print(np.max(f_ds), np.min(f_ds))
# 1.0 -1.0
# Matches the target range.

# For the record ---------------------------------
# MaxAbsScaler
# 4.7683717e-08 -1.0
# MinMaxScaler
# 1.000001 0.0
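# Hedged sketch (not from the original script): mapping generator output back to the
# original data scale. `g_out` is a hypothetical stand-in for a batch produced by the
# tanh generator; the real batch would come from the trained model.
g_out = np.tanh(np.random.randn(4, f_ds.shape[1]))   # values in (-1, 1), like tanh output
# Undo the two scalers in reverse order: MaxAbsScaler first, then StandardScaler.
g_recovered = scaler1.inverse_transform(scaler2.inverse_transform(g_out))
print(np.max(g_recovered), np.min(g_recovered))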
def build_model(): """ Parameters ---------- None Returns ------- grid : model (GridSearchCV object) GridSearchCV object fitted on the training dataset using MaxAbsScaler and Ridge regression. X_test : ndarray Numpy array holding the feature matrix for the test set. y_test : ndarray Numpy array holding values for response variable for the test set. Notes ----- Creates and builds a machine learning model to predict the monthly rent of an apartment using only features that apply to pricing of an apartment that is not currently rented. I. Assumptions: 1. Market doesn't increase so the rent for a new tenant is the same as for the current tenant. 2. The features selected does not include current tenants/ occupant details, expenditures, out of pocket rents etc. 3. We have not included most of the continuous variables as they contain details related to occupied units and not vacant ones 4. Recode and Flag variables have not been considered II. Feature selection and generation The data consists of 15342 rows and 197 columns. The columns are a mix of categorical and continuous variables. Not all of them influence the rent. After careful analysis, only 91 columns - 87 categorical and 3 continuous were chosen, which are expected to influence the rent of a vacant apartment. The complete list of included variables is given in the .xls file. For all categorical variables, one hot encoding is performed. Missing/NA values are not imputed as their information is held by creating spearate binary columns for each of the values. 'uf17' is chosen as the response variable y. All rows for which response variable y is missing or above topcode value, are dropped. The final dataset contains 10138 rows and 430 columns. No imputation is necessary since the selected rows have no missing/NA values for the continuous variables. The dataset is then split into X(feature matrix) and y(response variable) and is split into a training and testing set in a 80:20 ratio. III. Model generation and selection Various linear models were tried on the dataset including Linear Regression, KNNRegressor, Lasso regression, Ridge regression, Elastic Net. Out of these Ridge regression gave the highest accuracy of 59.26% and was chosen to model this data. For training and modeling, pipelining was used to first scale the data using MaxAbsScaler. MaxAbsScaler was chosen since the data with its large number of binary columns has a large number of zero values and is sparse. GridSearchCV was then used to perform cross validation with 5 folds to determine the best alpha value for Ridge regression. The model with selected alpha value was then fit on the training dataset. 
""" df = pd.read_csv('data.csv') non_cat_index = [ 1, 32, 35, 36, 52, 54, 56, 58, 62, 72, 73, 82, 84, 85, 87, 89, 91, 92, 99, 141, 142, 143, 144, 145, 147, 149, 151, 153, 155, 157, 159, 161, 164, 165, 168, 169, 170 ] keep = list(range(2, 31)) + [40, 41, 45, 46, 47, 61, 63, 64] + list( range(66, 82)) + [83, 86, 88, 90] + list(range(92, 99)) + list( range(100, 116)) + [ 118, 126, 127, 128, 129, 130, 137, 138, 139, 140, 163 ] col_names = list(df.columns.values) cat_names = [] for i in range(0, 197): if (i + 1) not in non_cat_index: cat_names.append(col_names[i]) keep_index = [] for i in keep: keep_index.append(i - 1) keep_df = df[keep_index] cat_names_new = [] for name in cat_names: if name in list(keep_df.columns.values): cat_names_new.append(name) keep_df_exp = pd.get_dummies(keep_df, columns=cat_names_new) non_cat_names_in_keep_df = [] for name in list(keep_df.columns.values): if name not in cat_names_new: non_cat_names_in_keep_df.append(name) # removing all rows where rent is not applicable or given keep_df_exp['uf17'].replace([99999], [np.NaN], inplace=True) keep_df_exp['uf17'].replace([7999], [np.NaN], inplace=True) keep_df_exp_new = keep_df_exp[keep_df_exp.uf17.notnull()] #keep_df_exp_new is the expanded chosen columns after dropping all non applicable rent rows filter_df = keep_df_exp_new X, y = filter_df.loc[:, filter_df.columns != 'uf17'], filter_df.loc[:, 'uf17'] X = X.as_matrix() y = np.array(y) X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42) ridge_pipe = make_pipeline(MaxAbsScaler(), Ridge()) param_grid = {'ridge__alpha': np.logspace(.1, 1, 10)} grid = GridSearchCV(ridge_pipe, param_grid, cv=5) grid.fit(X_train, y_train) return grid, X_test, y_test
def dataset_segment(self):
    # TODO: also write a file that stores the slidWin bookkeeping info, e.g. the length of each training set
    temp = None
    scaler_list = np.zeros((8, 1))  # matrix holding the scaling factor of each of the 8 channels
    plt.figure()
    for i in range(8):
        U_T = np.loadtxt(
            os.path.join(self.target_addr, 'PCA_Martix',
                         'PCAM_Ch%d.out' % i))  # load the principal component matrix (20 rows x 3 columns)
        X = np.loadtxt(
            os.path.join(self.target_addr, 'slidWin_Data',
                         'slidWin_Ch%d.out' % i))  # load the sliding-window data for this channel
        pca_data = np.dot(X, U_T)
        X_ = np.loadtxt(
            os.path.join(self.target_addr, 'PCA_Data',
                         'X_%d.out' % i))  # load the reconstructed data for comparison
        # pca_r2 = r2_score(X[6500:8000, 0], X_[6500:8000, 0])
        # print("The r2 value of Ch%d is %f" % (i, pca_r2))
        # plt.subplot(8, 1, i+1)
        # plt.plot(X[:, 0])
        # plt.plot(X_[:, 0], '--')
        """ Normalize the data; all principal components of a channel use the first component's scaling factor """
        scaler = MaxAbsScaler()  # data normalization
        scaler.fit(pca_data[:, 0].reshape(-1, 1))  # take only the first principal component's scaling factor; all components use it
        scaler_list[i, 0] = scaler.scale_
        pca_data = pca_data / scaler.scale_
        if i == 0:  # was `i is 0`, which relies on int interning and warns on Python 3.8+
            temp = pca_data
        else:
            temp = np.hstack((temp, pca_data))  # stack the channels horizontally
        plt.plot(X[6500:8000, 0])
        plt.plot(X_[6500:8000, 0], '--')
    plt.show()

    if os.path.isdir(os.path.join(self.target_addr, 'Scaler_Factors')) is False:  # create the path for the scaling factors
        os.mkdir(r'%s' % os.path.join(
            self.target_addr, 'Scaler_Factors'))  # directories must be created level by level, otherwise the deepest one is not found
    np.savetxt(
        os.path.join(self.target_addr, 'Scaler_Factors',
                     'pca_layer_MAS_scale.out'), scaler_list)

    ''' Split into training and test sets '''
    if np.sum(self.report) != temp.shape[0]:
        print("Warning: the data report does not match the database contents!")
    if np.size(self.report) == 1:
        x_train = temp[:, :]
        x_test = x_train
    else:
        x_train = temp[0:self.seg_boundary, :]
        x_test = temp[self.seg_boundary:, :]

    ''' Save the data '''
    if os.path.isdir(os.path.join(self.target_addr, 'PCA_Data')) is False:  # create the path if it does not exist
        os.mkdir(os.path.join(self.target_addr, 'PCA_Data'))
    np.savetxt(os.path.join(self.target_addr, 'PCA_Data', 'pca_train.out'), x_train)
    np.savetxt(os.path.join(self.target_addr, 'PCA_Data', 'pca_test.out'), x_test)
# Import functional utilities from sklearn.preprocessing import FunctionTransformer, MaxAbsScaler from sklearn.pipeline import FeatureUnion # Perform preprocessing get_text_data = FunctionTransformer(combine_text_columns, validate=False) get_numeric_data = FunctionTransformer(lambda x: x[NUMERIC_COLUMNS], validate=False) # Create the token pattern: TOKENS_ALPHANUMERIC TOKENS_ALPHANUMERIC = '[A-Za-z0-9]+(?=\\s+)' # Instantiate pipeline: pl pl = Pipeline([ ('union', FeatureUnion( transformer_list=[('numeric_features', Pipeline([('selector', get_numeric_data), ('imputer', Imputer())])), ('text_features', Pipeline([('selector', get_text_data), ('vectorizer', CountVectorizer( token_pattern=TOKENS_ALPHANUMERIC, ngram_range=(1, 2)) ), ('dim_red', SelectKBest(chi2, chi_k))]))])), ('scale', MaxAbsScaler()), ('clf', OneVsRestClassifier(LogisticRegression())) ])
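# A simplified, self-contained stand-in for the pipeline above, on invented toy data
# (hypothetical docs and labels; the real pipeline also needs combine_text_columns,
# NUMERIC_COLUMNS, Imputer, SelectKBest and chi_k, which are defined elsewhere).
# MaxAbsScaler fits naturally here because CountVectorizer emits a sparse matrix and
# max-abs scaling preserves sparsity.
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import MaxAbsScaler
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.pipeline import Pipeline

docs = ["math teacher salary", "school bus fuel", "teacher benefits", "bus maintenance"]
labels = np.array([[1, 0], [0, 1], [1, 0], [0, 1]])   # two hypothetical budget labels

toy_pl = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('scale', MaxAbsScaler()),
    ('clf', OneVsRestClassifier(LogisticRegression())),
])
toy_pl.fit(docs, labels)
print(toy_pl.predict(["teacher salary"]))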
def test_max_abs_scaler_onnx(self, rtol=1e-06, atol=1e-06): model = MaxAbsScaler() onnx_ml_pred, onnx_pred = self._test_scaler_converter(model) # Check that predicted values match np.testing.assert_allclose(onnx_ml_pred, onnx_pred, rtol=rtol, atol=atol)
import colorsys

import numpy as np
import matplotlib.pyplot as plt
import xgboost as xgb
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier, VotingClassifier
from sklearn.externals import joblib          # legacy import; recent versions use `import joblib`
from sklearn.grid_search import GridSearchCV  # legacy module, consistent with `grid_scores_` below
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import auc, confusion_matrix, roc_curve
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import Binarizer, MaxAbsScaler, StandardScaler, normalize
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier


def hyperParamSearch(X_train, y_train, X_test, y_test, clf_name="logistic", preprocess='Std', metric='euclidean'):
    """Fit one classifier family over a small parameter grid, print test-set scores and plot ROC curves."""
    lineWidth = 2

    # PREPROCESSING = FEATURE SCALING
    # The scaler is fitted on the training set only and then applied to the test set,
    # so that no test-set statistics leak into the preprocessing.
    if preprocess == 'MaxMin':
        preprocessing = MaxAbsScaler()
        X_train = preprocessing.fit_transform(X_train)
        X_test = preprocessing.transform(X_test)
    elif preprocess == 'Binarization':
        preprocessing = Binarizer()
        X_train = preprocessing.fit_transform(X_train)
        X_test = preprocessing.transform(X_test)
    elif preprocess == 'Std':
        preprocessing = StandardScaler(with_mean=False)
        X_train = preprocessing.fit_transform(X_train)
        X_test = preprocessing.transform(X_test)
    elif preprocess == 'full_std':
        preprocessing = StandardScaler()
        X_train = preprocessing.fit_transform(X_train.toarray())
        X_test = preprocessing.transform(X_test.toarray())
    elif preprocess == 'norm':
        # as in the original code, only the training set is normalised in this branch
        X_train = normalize(X_train.toarray(), axis=0, norm='l1')
    print 'preprocess %s completed' % (preprocess)

    def report_scores(clf):
        """Print the confusion matrix, accuracy, sensitivity and specificity on the test set."""
        y_test_pred = clf.predict(X_test)
        confus = confusion_matrix(y_test, y_test_pred)
        print confus
        True_neg = confus[0, 0]
        True_pos = confus[1, 1]
        # ability to detect the csp+ class
        sensitivity = True_pos * 1.0 / confus[1, :].sum()
        # ability to detect the csp- class
        specificity = True_neg * 1.0 / confus[0, :].sum()
        accuracy = (True_pos + True_neg) * 1.0 / confus.sum()
        print 'accuracy : %s' % (accuracy)
        print 'sensitivity : %s' % (sensitivity)
        print 'specificity : %s' % (specificity)

    def plot_roc(clf, color, label):
        """Add the ROC curve of one fitted classifier to the current figure."""
        if isinstance(clf, LinearSVC):
            # LinearSVC has no predict_proba; rank by signed distance to the hyperplane instead
            ROC_scores = clf.decision_function(X_test)
        else:
            ROC_scores = clf.predict_proba(X_test)[:, 1]
        fpr, tpr, _ = roc_curve(y_test, ROC_scores, pos_label=1)
        ROC_auc = auc(fpr, tpr)
        plt.plot(fpr, tpr, color=color, lw=lineWidth,
                 label='parameters : %s (area = %0.2f)' % (label, ROC_auc))

    def finalize_figure():
        """Draw the chance diagonal and decorate the ROC figure."""
        plt.plot([0, 1], [0, 1], color='navy', lw=lineWidth, linestyle='--', label='Monkey')
        plt.xlim([0.0, 1.05])
        plt.ylim([0.0, 1.05])
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('ROC Curves for %s classifier with %s preprocessing' % (clf_name, preprocess))
        plt.legend(loc="lower right", prop={'size': 6})
        plt.show()
        print 'ROC curves done'

    # Parameter grid, classifier factory and label prefix for each supported family.
    if clf_name == "logistic":
        params = 10 ** np.linspace(-8, 2, 13)
        make_clf = lambda C: LogisticRegression(n_jobs=-1, penalty='l2', C=C)
        label_prefix = 'C='
    elif clf_name == "randomforest":
        params = [50, 100, 200]
        make_clf = lambda nb: RandomForestClassifier(n_jobs=-1, n_estimators=nb)
        label_prefix = 'N='
    elif clf_name == "kNN":
        params = [10, 50, 75, 100]
        make_clf = lambda k: KNeighborsClassifier(n_jobs=-1, n_neighbors=k, metric=metric)
        label_prefix = 'k='
    elif clf_name == "naiveBayesM":
        params = [1.]
        make_clf = lambda a: MultinomialNB(alpha=a)
        label_prefix = 'a='
    elif clf_name == "linSVM":
        params = 10 ** np.linspace(-5, 1, 6)
        make_clf = lambda c: LinearSVC(C=c, penalty='l2', loss='squared_hinge')
        label_prefix = 'C='
    elif clf_name == "voting":
        params = ['soft', 'hard']
        make_clf = lambda v: VotingClassifier(
            estimators=[('lr', LogisticRegression(n_jobs=-1, penalty='l2', C=1e-6)),
                        ('RF', RandomForestClassifier(n_jobs=-1, n_estimators=50))],
            voting=v)
        label_prefix = 'v='
    elif clf_name == "AdaBoost":
        params = [1, 2, 3, 4, 5]
        make_clf = lambda m: AdaBoostClassifier(DecisionTreeClassifier(max_depth=m), n_estimators=20)
        label_prefix = 'm='
    elif clf_name == "xgb":
        # xgboost is tuned with a cross-validated grid search instead of the manual loop
        cv_params = {'max_depth': [3, 5, 7], 'min_child_weight': [1, 3, 5]}
        ind_params = {'learning_rate': 0.1, 'n_estimators': 1000, 'seed': 0,
                      'subsample': 0.8, 'colsample_bytree': 0.8, 'objective': 'binary:logistic'}
        optimized_GBM = GridSearchCV(xgb.XGBClassifier(**ind_params), cv_params,
                                     scoring='accuracy', cv=5, n_jobs=-1)
        optimized_GBM.fit(X_train, y_train)
        print optimized_GBM.grid_scores_
        print 'Cross-validation !'
        return
    else:
        raise ValueError('unknown classifier name: %s' % clf_name)

    N = len(params)
    HSV_tuples = [(x * 1.0 / N, 0.5, 0.5) for x in range(N)]
    RGB_tuples = [colorsys.hsv_to_rgb(*h) for h in HSV_tuples]
    print 'colors generation'
    plt.figure()
    for i, p in enumerate(params):
        clf = make_clf(p)
        clf.fit(X_train, y_train)
        print '%s fitted : %d/%d' % (clf_name, i + 1, N)
        if clf_name != "linSVM":
            # the original linSVM branch skips the confusion-matrix report
            report_scores(clf)
        plot_roc(clf, RGB_tuples[i], label_prefix + str(p))
        joblib.dump(clf, clf_name + '_' + str(p), compress=True)
    finalize_figure()
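The manual parameter loops above can also be expressed with a sklearn Pipeline wrapped in GridSearchCV, which refits the scaler inside every cross-validation fold and so avoids any train/test leakage by construction. The following is a minimal sketch, not the original author's code; the names X_train/y_train and the choice of `roc_auc` scoring are assumptions, and only the "logistic" grid is shown.

import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MaxAbsScaler

pipe = Pipeline([
    ('scale', MaxAbsScaler()),                              # refitted on the training folds only
    ('clf', LogisticRegression(penalty='l2', max_iter=1000)),
])
param_grid = {'clf__C': 10 ** np.linspace(-8, 2, 13)}       # same grid as the "logistic" branch above
search = GridSearchCV(pipe, param_grid, scoring='roc_auc', cv=5, n_jobs=-1)
# search.fit(X_train, y_train)
# print(search.best_params_, search.best_score_)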
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, MaxAbsScaler, StandardScaler

id = [i for i in range(1, 21)]
score = [42, 47, 59, 27, 84, 49, 72, 43, 73, 59, 52, 49, 89, 27, 54, 49, 92, 45, 37, 95]
data = pd.DataFrame({'ID': id, 'Score': score})

MM_Scaler = MinMaxScaler()
MA_Scaler = MaxAbsScaler()
Std_Scaler = StandardScaler()

data1 = MM_Scaler.fit_transform(data['Score'].values.reshape(-1, 1))
data2 = MA_Scaler.fit_transform(data['Score'].values.reshape(-1, 1))
data3 = Std_Scaler.fit_transform(data['Score'].values.reshape(-1, 1))

data1 = pd.DataFrame({'ID': id, 'Score': data1.flatten()})
data2 = pd.DataFrame({'ID': id, 'Score': data2.flatten()})
data3 = pd.DataFrame({'ID': id, 'Score': data3.flatten()})

print(data1)
print(data2)
print(data3)
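As a quick sanity check that could follow the snippet above (added here as an illustration, not part of the original), the three scalers can be compared by printing the range each one produces: MinMaxScaler maps the scores onto [0, 1], MaxAbsScaler divides by the maximum absolute value so everything stays positive and the largest score becomes 1, and StandardScaler centres the scores around zero.

for name, scaled in [('MinMax', data1), ('MaxAbs', data2), ('Std', data3)]:
    print(name, scaled['Score'].min(), scaled['Score'].max())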
def main():
    #args = prepare_optparser()
    # setting:
    #seed = args.seed
    seed = 1
    np.random.seed(seed)
    torch.manual_seed(seed)

    batch_size = 64
    lr = 0.0001
    weight_decay = 5e-4
    model_name = "VAE_test_v50"
    outdir = model_path
    k = 50
    latent = 10
    input_dim = 108633
    encode_dim = [2048, 128]
    decode_dim = [1024]  #[2048]
    max_iter = 3000
    dims = [input_dim, latent, encode_dim, decode_dim]
    model = CR_VAE(dims, n_centroids=k)
    normalizer = MaxAbsScaler()

    expected_file_name = "/%s/DNase_test_peaks_top1000.csv" % (data_path)
    dataset = DNaseDataset(expected_file_name, expected_file_name, transpose=False)
    loader_params = {'batch_size': batch_size, 'shuffle': False, 'num_workers': 16,
                     'drop_last': False, 'pin_memory': True}
    testloader = DataLoader(dataset, **loader_params)

    # Training
    model.init_gmm_params(testloader)
    #model.load_state_dict(torch.load('/%s/CR_VAE_model_VAE_test_v40.ckpt' % (model_path)))  #*****

    input_files = []
    for iteration in range(0, 80):
        input_file_name = "/%s/DNase_train_peaks_top1000_%s.csv" % (data_path, str(iteration))
        expected_file_name = "/%s/DNase_test_peaks_top1000.csv" % (data_path)
        input_files.append([input_file_name, expected_file_name])

    model.fit(input_files,
              lr=lr,
              batch_size=batch_size,
              weight_decay=weight_decay,
              device=device,
              max_iter=max_iter,
              name=model_name,
              outdir=outdir)

    torch.save(model.state_dict(), '/%s/CR_VAE_model_%s.ckpt' % (model_path, model_name))
    torch.save(model, '/%s/CR_VAE_model_%s.tmp' % (model_path, model_name))  # Save the whole model

    '''
    # output
    input = pd.read_csv(expected_file_name, sep="\t", header=0, index_col=0)
    model.load_state_dict(torch.load('/%s/CR_VAE_model_%s.ckpt' % (model_path, model_name),
                                     map_location=lambda storage, loc: storage), strict=False)
    feature = model.encodeBatch(testloader, device=device, out='z')
    feature = pd.DataFrame(feature)
    feature.index = input.columns.values
    feature.to_csv(os.path.join(outdir, '%s_feature.txt' % (model_name)), sep='\t', header=False, index=True)
    '''
    '''
# The program proceeds in the following steps
# 1. Import the libraries
import numpy as np
import cPickle as pickle
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import MaxAbsScaler

# 2. Load the data
raw_X = np.load("../../Test Data/2016-03-13/ratio_25/training_features.npy")
raw_X = raw_X.astype("float64")
raw_y = np.load("../../Test Data/2016-03-13/ratio_25/bots.npy")

# 4. Set up the normalizer
normalizer = MaxAbsScaler()

# 5. List of index numbers
A = [1291, 885, 656, 1527, 1491, 89, 845, 293, 1296, 1076, 1303, 1278, 1185, 705, 184, 634,
     484, 1104, 9, 1422, 623, 525, 1427, 1189, 252, 1055, 1226, 458, 1323, 442, 972, 1177,
     30, 167, 959, 83, 22, 1159, 468, 183, 1324, 420, 1134, 1530, 730, 766, 640, 606, 750,
     816, 823, 1410, 1094, 862, 1210, 1219, 1172, 1218, 1498, 627, 1168, 1175, 255, 191,
     111, 437, 26, 1142, 609, 698, 616, 822, 1438, 861, 1256, 737, 25, 1520, 1160, 1035,
     973, 1111, 327, 1044, 542, 1378, 250, 1307, 452, 613, 625, 1337, 825, 1261, 241, 407,
     745, 733, 1505, 1077, 1435, 1039, 1349, 201, 511, 955, 1157, 336, 431, 230, 365, 610,
     283, 1289, 62, 446, 1023, 423, 757, 641, 333, 33, 32, 907, 863, 7, 968, 204, 904, 99,
     1116, 799, 1030, 740, 851, 805, 612, 1109, 648, 1020, 1545, 1117, 366, 633, 94, 769,
     216, 670, 667, 1139, 887, 1231, 1453, 13, 1203, 1382, 12, 1114, 1353, 645, 435, 1411,
     1536, 101, 245, 1188, 1412, 776, 1067, 52, 454, 1263, 1450, 704, 240, 351, 80, 1001,
     923, 962, 1370,
class SNNAP:

    def __init__(self, clip_runtime=True, feature_selection='chi-squared', top_n=3, k_neighbours=60):
        self._name = 'snnap'
        self._clip_runtime = clip_runtime
        self._feature_selection = feature_selection
        self._top_n = top_n
        self._k_neighbours = k_neighbours
        self._imputer = SimpleImputer()
        self._scaler = MaxAbsScaler()
        self._runtime_scaler = StandardScaler()
        self._models = []
        self._rfr_params = {
            'n_estimators': 100,
            'criterion': 'mse',
            'max_depth': None,
            'min_samples_split': 2
        }

    def get_name(self):
        return self._name

    def fit(self, scenario: ASlibScenario, fold: int, num_instances: int):
        self._num_algorithms = len(scenario.algorithms)
        self._top_n = min(self._num_algorithms, self._top_n)

        # resample `amount_of_training_instances` instances and preprocess them accordingly
        features, performances = self._resample_instances(
            scenario.feature_data.values, scenario.performance_data.values,
            num_instances, random_state=fold)

        # TODO: apply feature filtering such as chi-squared based selection technique
        features, performances = self._preprocess_scenario(scenario, features, performances)

        # train a runtime prediction model for each algorithm
        self._models = [
            RandomForestRegressor(random_state=fold, **self._rfr_params)
            for alg in range(self._num_algorithms)
        ]
        for num, model in enumerate(self._models):
            model.fit(features, performances[:, num])

        # build index to retrieve k nearest neighbours based on Jaccard distance of best n solvers
        self._index = BallTree(performances, leaf_size=30, metric='pyfunc',
                               func=SNNAP._top_n_jaccard,
                               metric_params={'top_n': self._top_n})
        self._performances = np.copy(performances)

    def predict(self, features, instance_id: int):
        assert (features.ndim == 1), '`features` must be one dimensional'
        features = np.expand_dims(features, axis=0)
        features = self._imputer.transform(features)
        features = self._scaler.transform(features)

        # predict runtimes and get k nearest neighbours based on Jaccard distance of best n solvers
        predicted = np.asarray([model.predict(features) for model in self._models]).reshape(1, -1)
        neighbour_idx = np.squeeze(
            self._index.query(predicted, self._k_neighbours, return_distance=False))

        # find the best solver on the instance's k nearest neighbours (best avg. runtime / PAR10 score)
        sub_performances = self._performances[neighbour_idx, :]

        # the summed performance induces a valid ranking
        return np.sum(sub_performances, axis=0)

    def _resample_instances(self, feature_data, performance_data, num_instances, random_state):
        num_instances = min(num_instances, np.size(performance_data, axis=0)) \
            if num_instances > 0 else np.size(performance_data, axis=0)
        return resample(feature_data, performance_data, n_samples=num_instances,
                        random_state=random_state)

    def _preprocess_scenario(self, scenario: ASlibScenario, features, performances):
        # TODO: paper does not explicitly mention feature imputation & feature scaling
        features = self._imputer.fit_transform(features)
        features = self._scaler.fit_transform(features)

        # train predictors and select algorithms on running time instead of PAR10 if warranted
        if self._clip_runtime:
            performances = np.clip(performances, a_min=np.NINF,
                                   a_max=scenario.algorithm_cutoff_time)

        # scale performances to zero mean and unitary standard deviation
        performances = self._runtime_scaler.fit_transform(performances)

        return features, performances

    @staticmethod
    def _top_n_jaccard(x, y, **kwargs):
        top_n = kwargs['metric_params']['top_n']
        top_n_1 = set(np.argpartition(x, top_n)[:top_n])
        top_n_2 = set(np.argpartition(y, top_n)[:top_n])
        return len(top_n_1.intersection(top_n_2)) / float(len(top_n_1.union(top_n_2)))
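To make the custom neighbour metric concrete, here is a small standalone check of the same top-n Jaccard overlap, added purely as an illustration (it re-implements the calculation rather than importing the class; the runtime vectors are hypothetical). With lower values meaning faster solvers, `np.argpartition(x, top_n)[:top_n]` picks the indices of the `top_n` fastest solvers, and the score is the Jaccard overlap of those index sets.

import numpy as np

def top_n_jaccard(x, y, top_n=3):
    """Jaccard overlap of the indices of the top_n smallest entries (i.e. fastest solvers)."""
    best_x = set(np.argpartition(x, top_n)[:top_n])
    best_y = set(np.argpartition(y, top_n)[:top_n])
    return len(best_x & best_y) / float(len(best_x | best_y))

# two hypothetical runtime vectors over five solvers
runtimes_a = np.array([1.2, 50.0, 3.4, 900.0, 7.7])
runtimes_b = np.array([2.0, 4.1, 800.0, 950.0, 6.3])
print(top_n_jaccard(runtimes_a, runtimes_b))  # best sets {0, 2, 4} vs {0, 1, 4} -> 2/4 = 0.5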
import time

import numpy as np
import pandas
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MaxAbsScaler

from estimators import LSHNearestNeighbors
from preprocessors import text_preprocess

if __name__ == "__main__":
    df = pandas.read_csv("/media/alexander/b32bf4b4-8724-4107-9d19-abf6615c2f60/alexander/HELP_FILE/query.yaHotelId.showInTop.sure.final.tsv", sep="\t")
    print("Initial data shape:", df.shape, ";", "Number of hotels:", len(df["yaHotelId"].unique()))

    sure_df = df[df["sure"]]
    print(sure_df.shape)

    # keep only hotels that occur at least 5 times among the "sure" rows
    filtered_values = [value[0] for value in sure_df["yaHotelId"].value_counts().iteritems() if value[1] >= 5]
    filtered_df = sure_df[sure_df["yaHotelId"].isin(filtered_values)]
    print("Resulting data shape:", filtered_df.shape, ";", "Number of hotels:", len(filtered_df["yaHotelId"].unique()))

    vectorizer = TfidfVectorizer(preprocessor=text_preprocess)
    y = np.array(filtered_df["yaHotelId"])
    X = vectorizer.fit_transform(filtered_df["query"])
    print("X shape:", X.shape)

    scaler = MaxAbsScaler()
    X = scaler.fit_transform(X)  # the original discarded this result, so the scaling had no effect

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)

    clf = LSHNearestNeighbors(n_estimators=10, n_candidates=100, n_neighbors=9, mode="parzen window")
    clf.fit(X_train, y_train)

    t1 = time.time()
    y_pred = clf.predict(X_test)
    t2 = time.time() - t1
    print("delta time:", t2)
    print("mean time for one query:", t2 / X_test.shape[0])
    print("accuracy:", accuracy_score(y_test, y_pred))
model = Sequential()
model.add(MaxoutDense(100, input_dim=42))
model.add(Activation('relu'))
model.add(GaussianNoise(0.00001))
model.add(Dropout(0.3))
model.add(MaxoutDense(1, input_dim=100))
model.add(Activation('sigmoid'))

#ada = Adagrad(lr=0.001)
ada = SGD(lr=0.0003, momentum=0.9, decay=0.0001, nesterov=True)
model.compile(optimizer=ada, loss='binary_crossentropy', metrics=['accuracy'])

scaler = MaxAbsScaler()
train_train_scaled = scaler.fit_transform(train_train[features])
train_test_scaled = scaler.transform(train_test[features])

model.fit(train_train_scaled, train_train.target.values, nb_epoch=150, batch_size=100)

train_train_pred = model.predict(train_train_scaled, batch_size=100)
train_test_pred = model.predict(train_test_scaled, batch_size=100)

train_score = log_loss(train_train.target.values, train_train_pred)
test_score = log_loss(train_test.target.values, train_test_pred)

#test_poly = poly.transform(test[features])
test_scaled = scaler.transform(test[features])
test_pred = model.predict(test_scaled, batch_size=100)
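The snippet above uses the Keras 1 API (`MaxoutDense`, `nb_epoch`, `lr`, `decay`), which no longer exists in current Keras. Below is a hedged sketch of a rough modern equivalent, not the author's code: `MaxoutDense` was removed after Keras 1, so a plain Dense+ReLU layer is substituted, which changes the architecture, and the old optimizer `decay` argument is omitted because it is not available in the same form; `train_train_scaled` and `train_train.target.values` are assumed from the snippet above.

from tensorflow import keras
from tensorflow.keras import layers

model = keras.Sequential([
    keras.Input(shape=(42,)),
    layers.Dense(100, activation='relu'),   # stands in for MaxoutDense(100); true maxout needs a custom layer
    layers.GaussianNoise(0.00001),
    layers.Dropout(0.3),
    layers.Dense(1, activation='sigmoid'),
])
sgd = keras.optimizers.SGD(learning_rate=0.0003, momentum=0.9, nesterov=True)
model.compile(optimizer=sgd, loss='binary_crossentropy', metrics=['accuracy'])
# model.fit(train_train_scaled, train_train.target.values, epochs=150, batch_size=100)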
def _digits_dataset(dtype=np.float32):
    X, y = load_digits(return_X_y=True)
    X = X.astype(dtype, copy=False)
    X = MaxAbsScaler().fit_transform(X)
    return X, y
# classColours['Water'] = [157,212,255]
classColours['VegWater'] = [191, 255, 0]

# define variables for the classification
variables = ['VVMin', 'VHMin', 'VVdivVHMin',
             'VVMax', 'VHMax', 'VVdivVHMax',
             'VVAvg', 'VHAvg', 'VVdivVHAvg',
             'VVStd', 'VHStd', 'VVdivVHStd']

# run the classification
classratutils.classifyWithinRAT(outputClumps, classesIntCol, classesNameCol, variables,
                                classifier=classifier, classColours=classColours,
                                preProcessor=MaxAbsScaler())

# export rat column to image
gdalformat = 'GTiff'
datatype = rsgislib.TYPE_8INT
fields = ['OutClass']
rastergis.exportCols2GDALImage(outputClumps, outimage, gdalformat, datatype, fields)

os.system('afplay /System/Library/Sounds/Tink.aiff')
os.system('afplay /System/Library/Sounds/Tink.aiff')

print('It took {0:0.1f} minutes'.format((time.time() - start) / 60))  # time-stamp
def _max_abs_scaler(column):
    sc = MaxAbsScaler()
    sc.fit(column.reshape(-1, 1))
    new_col = sc.transform(column.reshape(-1, 1))
    return new_col
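A quick usage sketch (added for illustration; the input values are hypothetical): the helper expects a 1-D numpy column, refits a fresh MaxAbsScaler on every call, and returns a column vector of shape (n, 1) whose entries are divided by the column's maximum absolute value.

import numpy as np

col = np.array([3.0, -6.0, 1.5, 12.0])   # hypothetical feature column
scaled = _max_abs_scaler(col)
print(scaled.ravel())                    # divided by max(|x|) = 12 -> [0.25, -0.5, 0.125, 1.0]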
def normalize_features(X: np.array) -> np.array:
    """Scale each feature by its maximum absolute value ([0, 1] for non-negative data, [-1, 1] in general)."""
    scaler = MaxAbsScaler().fit(X)
    return scaler.transform(X)
Trainset2 = xfcss_train.values[rannums, :]
Testset = x_train[test_set, :]
Testset2 = xfcss_train.values[test_set, :]
# Trainy = y_gt[rannums, :]
# Testy = y_gt[test_set, :]
Trainy = y_train[rannums, :]
Testy = y_train[test_set, :]


# In[20]:


# sc_X2 = StandardScaler()
# sc_y = StandardScaler()
sc_X2 = MaxAbsScaler()
sc_y = MaxAbsScaler()


# In[21]:


Xtrainz = Trainset
Xtrainz2 = Trainset2
ytrainz = Trainy

X = Xtrainz
X2 = sc_X2.fit_transform(Xtrainz2)
y = sc_y.fit_transform(ytrainz)


# In[22]:
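Because the target is scaled with its own MaxAbsScaler (`sc_y`) above, any predictions made by a model trained on the scaled data live in the scaled target space and should be mapped back with `inverse_transform` before being compared to `Testy`. The following is a hedged sketch, not part of the original notebook: the regressor `reg` and a single-column target are assumptions.

# assuming some fitted regressor `reg` trained on (X2, y) and a single-column target (both hypothetical)
y_pred_scaled = reg.predict(sc_X2.transform(Testset2)).reshape(-1, 1)
y_pred = sc_y.inverse_transform(y_pred_scaled)   # undo the MaxAbs scaling of the target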
def _mnist_dataset(dtype=np.float32):
    X, y = fetch_openml('mnist_784', version=1, return_X_y=True)
    X = X.astype(dtype, copy=False)
    X = MaxAbsScaler().fit_transform(X)
    return X, y
def main():
    X, y = get_data('../../data/train.csv')
    sclr = MaxAbsScaler()
    X = sclr.fit_transform(X)
    # pickle.dump(sclr, open('./dumps/scaler_pickle', 'wb+'))

    X_test, y_test = get_data('../../data/val.csv')
    X_test = sclr.transform(X_test)

    X_fin, y_fin = get_data('../../data/test.csv')
    X_fin = sclr.transform(X_fin)

    other, yo = get_data('../../data/other.csv')
    other = sclr.transform(other)

    lin = linear_model.LogisticRegression(
        C=10000,
    )

    # selector = RFE(lin, 21, step=1)
    # selector.fit(X, y)
    # X = selector.transform(X)
    # X_test = selector.transform(X_test)
    # X_fin = selector.transform(X_fin)
    # for i in range(len(selector.support_)):
    #     print i+1, selector.support_[i]

    lin.fit(X, y)
    # pickle.dump(lin, open('./dumps/lin_reg_pickle', 'wb+'))

    x1 = lin.predict_proba(X)
    x1_test = lin.predict_proba(X_test)
    # x1_fin = lin.predict_proba(X_fin)
    # o1 = lin.predict_proba(other)

    print 'lin'
    print metrics.classification_report(y, lin.predict(X))
    print metrics.classification_report(y_test, lin.predict(X_test))
    print metrics.classification_report(y_fin, lin.predict(X_fin))

    roc = lin.predict_proba(X_fin)
    # r = lin.predict(X_test)
    # l1 = []
    # l2 = []
    # for i in range(len(roc)):
    #     if max(roc[i]) > 0.5:
    #         l1.append(y_fin[i])
    #         l2.append(r[i])
    # print 'dsfasdfasd'
    # print metrics.classification_report(l1, l2)
    # return

    fpr_grd0, tpr_grd0, _ = metrics.roc_curve(y_fin, roc[:, 0], pos_label=0)
    fpr_grd1, tpr_grd1, _ = metrics.roc_curve(y_fin, roc[:, 1], pos_label=1)
    fpr_grd2, tpr_grd2, _ = metrics.roc_curve(y_fin, roc[:, 2], pos_label=2)
    plt.plot(fpr_grd0, tpr_grd0, label='NRP')
    plt.plot(fpr_grd1, tpr_grd1, label='RiPP')
    plt.plot(fpr_grd2, tpr_grd2, label='Polyketide')
    plt.xlabel('False positive rate')
    plt.ylabel('True positive rate')
    plt.title('ROC curve')
    plt.legend(loc='best')
    plt.show()

    # print lin.coef_
    # print sum(lin.predict_proba(X_test)[0])

    svm_model = SVC(
        C=5000,
        # kernel='linear',
        # degree=2, coef0=100,
        # probability=True,
        # shrinking=True,
        # class_weight='balanced',
        probability=True,
        # decision_function_shape='ovr'
    )
    svm_model.fit(X, y)

    x2 = svm_model.predict_proba(X)
    x2_test = svm_model.predict_proba(X_test)
    x2_fin = svm_model.predict_proba(X_fin)
    o2 = svm_model.predict_proba(other)

    print 'svm'
    print metrics.classification_report(y, svm_model.predict(X))
    print metrics.classification_report(y_test, svm_model.predict(X_test))