import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MaxAbsScaler

# NOTE: Make sure that the class is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)
features = tpot_data.drop('target', axis=1).values
training_features, testing_features, training_target, testing_target = \
    train_test_split(features, tpot_data['target'].values, random_state=42)

# Score on the training set was: 0.9269704433497538
exported_pipeline = make_pipeline(
    MaxAbsScaler(),
    LogisticRegression(C=0.01, dual=False, penalty="l2"))

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
from sklearn.preprocessing import MaxAbsScaler

X = [[1., 10., 2.],
     [2., 0., 0.],
     [5., 1., -1.]]
transformer = MaxAbsScaler().fit(X)
transformer
transformer.transform(X)
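# A minimal sketch (not part of the original snippets) illustrating why MaxAbsScaler
# is often chosen for sparse inputs: it only divides each column by its maximum
# absolute value and never centers the data, so scipy.sparse matrices stay sparse.
# The toy matrix below is invented for illustration.
import numpy as np
from scipy import sparse
from sklearn.preprocessing import MaxAbsScaler

X_sparse = sparse.csr_matrix([[1., 10., 2.], [2., 0., 0.], [5., 1., -1.]])
X_scaled = MaxAbsScaler().fit_transform(X_sparse)
print(X_scaled.toarray())  # each column now lies in [-1, 1]; zero entries are untouched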
args = parser.parse_args()

# Set random seed
seed = args.seed
np.random.seed(seed)
torch.manual_seed(seed)

if torch.cuda.is_available():  # cuda device
    device = 'cuda'
    torch.cuda.set_device(args.gpu)
else:
    device = 'cpu'

batch_size = args.batch_size

normalizer = MaxAbsScaler()
adata = load_data(args.dataset, transpose=args.transpose)

args.min_peaks = int(args.min_peaks) if args.min_peaks >= 1 else args.min_peaks
total_cells = adata.shape[0]
min_cells = int(args.low * total_cells)
max_cells = int(args.high * total_cells)
filter_features(adata, min_cells=min_cells)
filter_features(adata, max_cells=max_cells)
filter_cells(adata, min_peaks=args.min_peaks)

dataset = SingleCellDataset(adata, transforms=[normalizer.fit_transform])
trainloader = DataLoader(dataset, batch_size=batch_size, shuffle=True, drop_last=True)
testloader = DataLoader(dataset, batch_size=batch_size, shuffle=False, drop_last=False)
cell_num = dataset.shape[0]
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MaxAbsScaler

# NOTE: Make sure that the class is labeled 'class' in the data file
tpot_data = np.recfromcsv('PATH/TO/DATA/FILE', delimiter='COLUMN_SEPARATOR', dtype=np.float64)
features = np.delete(tpot_data.view(np.float64).reshape(tpot_data.size, -1),
                     tpot_data.dtype.names.index('class'), axis=1)
training_features, testing_features, training_classes, testing_classes = \
    train_test_split(features, tpot_data['class'], random_state=42)

exported_pipeline = make_pipeline(
    MaxAbsScaler(),
    LogisticRegression(penalty="l2"))

exported_pipeline.fit(training_features, training_classes)
results = exported_pipeline.predict(testing_features)
df = pd.concat([Train_data, Test_data])

'''Data preprocessing'''
df = df.fillna(-1)
train_num = train_Y.shape[0]
train_x = df[:train_num].values

estimator = RandomForestRegressor()
estimator.fit(train_x, train_Y)

feats = pd.Series(data=estimator.feature_importances_, index=df.columns)
feats = feats.sort_values(ascending=False)
high_feature = list(feats[:37].index)

df = df[high_feature].values
MMEncoder = MaxAbsScaler()
df = MMEncoder.fit_transform(df)
test_X = df[train_num:]

with tf.Session() as sess:
    saver = tf.train.import_meta_graph(
        "/home/rex/桌面/T-Brain/DNN/2019-06-01/test.meta")
    saver.restore(sess, "/home/rex/桌面/T-Brain/DNN/2019-06-01/test")
    graph = tf.get_default_graph()
    predict = tf.get_collection('predict')[0]
    X = graph.get_operation_by_name("X").outputs[0]
    print("Model restored successfully!")
class ParallelCoordinates(DataVisualizer):
    """
    Parallel coordinates displays each feature as a vertical axis spaced
    evenly along the horizontal, and each instance as a line drawn between
    each individual axis. This allows you to detect braids of similar
    instances and separability that suggests a good classification problem.

    Parameters
    ----------
    ax : matplotlib Axes, default: None
        The axis to plot the figure on. If None is passed in the current
        axes will be used (or generated if required).

    features : list, default: None
        a list of feature names to use
        If a DataFrame is passed to fit and features is None, feature names
        are selected as the columns of the DataFrame.

    classes : list, default: None
        a list of class names for the legend
        The class labels for each class in y, ordered by sorted class index.
        These names act as a label encoder for the legend, identifying
        integer classes or renaming string labels. If omitted, the class
        labels will be taken from the unique values in y.
        Note that the length of this list must match the number of unique
        values in y, otherwise an exception is raised.

    normalize : string or None, default: None
        specifies which normalization method to use, if any
        Current supported options are 'minmax', 'maxabs', 'standard', 'l1',
        and 'l2'.

    sample : float or int, default: 1.0
        specifies how many examples to display from the data
        If int, specifies the maximum number of samples to display.
        If float, specifies a fraction between 0 and 1 to display.

    random_state : int, RandomState instance or None
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance used
        by np.random; only used if shuffle is True and sample < 1.0

    shuffle : boolean, default: True
        specifies whether sample is drawn randomly

    colors : list or tuple, default: None
        A single color to plot all instances as or a list of colors to color
        each instance according to its class. If not enough colors per class
        are specified then the colors are treated as a cycle.

    colormap : string or cmap, default: None
        The colormap used to create the individual colors. If classes are
        specified the colormap is used to evenly space colors across each
        class.

    alpha : float, default: None
        Specify a transparency where 1 is completely opaque and 0 is
        completely transparent. This property makes densely clustered lines
        more visible. If None, the alpha is set to 0.5 in "fast" mode and
        0.25 otherwise.

    fast : bool, default: False
        Fast mode improves the performance of the drawing time of parallel
        coordinates but produces an image that does not show the overlap of
        instances in the same class. Fast mode should be used when drawing
        all instances is too burdensome and sampling is not an option.

    vlines : boolean, default: True
        flag to determine vertical line display

    vlines_kwds : dict, default: None
        options to style or display the vertical lines, default: None

    kwargs : dict
        Keyword arguments that are passed to the base class and may influence
        the visualization as defined in other Visualizers.

    Attributes
    ----------
    n_samples_ : int
        number of samples included in the visualization object

    features_ : ndarray, shape (n_features,)
        The names of the features discovered or used in the visualizer that
        can be used as an index to access or modify data in X. If a user
        passes feature names in, those features are used. Otherwise the
        columns of a DataFrame are used or just simply the indices of the
        data array.

    classes_ : ndarray, shape (n_classes,)
        The class labels that define the discrete values in the target. Only
        available if the target type is discrete. This is guaranteed to be
        strings even if the classes are a different type.

    Examples
    --------
    >>> visualizer = ParallelCoordinates()
    >>> visualizer.fit(X, y)
    >>> visualizer.transform(X)
    >>> visualizer.poof()

    Notes
    -----
    These parameters can be influenced later on in the visualization process,
    but can and should be set as early as possible.
    """

    NORMALIZERS = {
        "minmax": MinMaxScaler(),
        "maxabs": MaxAbsScaler(),
        "standard": StandardScaler(),
        "l1": Normalizer("l1"),
        "l2": Normalizer("l2"),
    }

    def __init__(
        self,
        ax=None,
        features=None,
        classes=None,
        normalize=None,
        sample=1.0,
        random_state=None,
        shuffle=False,
        colors=None,
        colormap=None,
        alpha=None,
        fast=False,
        vlines=True,
        vlines_kwds=None,
        **kwargs
    ):
        if "target_type" not in kwargs:
            kwargs["target_type"] = "discrete"

        super(ParallelCoordinates, self).__init__(
            ax=ax,
            features=features,
            classes=classes,
            colors=colors,
            colormap=colormap,
            **kwargs
        )

        # Validate 'normalize' argument
        if normalize in self.NORMALIZERS or normalize is None:
            self.normalize = normalize
        else:
            raise YellowbrickValueError(
                "'{}' is an unrecognized normalization method".format(normalize)
            )

        # Validate 'sample' argument
        if isinstance(sample, int):
            if sample < 1:
                raise YellowbrickValueError(
                    "`sample` parameter of type `int` must be greater than 1"
                )
        elif isinstance(sample, float):
            if sample <= 0 or sample > 1:
                raise YellowbrickValueError(
                    "`sample` parameter of type `float` must be between 0 and 1"
                )
        else:
            raise YellowbrickTypeError("`sample` parameter must be int or float")
        self.sample = sample

        # Set sample parameters
        if isinstance(shuffle, bool):
            self.shuffle = shuffle
        else:
            raise YellowbrickTypeError("`shuffle` parameter must be boolean")

        if self.shuffle:
            if (random_state is None) or isinstance(random_state, int):
                self._rng = RandomState(random_state)
            elif isinstance(random_state, RandomState):
                self._rng = random_state
            else:
                raise YellowbrickTypeError(
                    "`random_state` must be None, int, or np.random.RandomState"
                )
        else:
            self._rng = None

        # Visual and drawing parameters
        self.fast = fast
        self.alpha = alpha
        self.show_vlines = vlines
        self.vlines_kwds = vlines_kwds or {"linewidth": 1, "color": "black"}

        # Internal properties
        self._increments = None
        self._colors = None

    def fit(self, X, y=None, **kwargs):
        """
        The fit method is the primary drawing input for the visualization
        since it has both the X and y data required for the viz and the
        transform method does not.

        Parameters
        ----------
        X : ndarray or DataFrame of shape n x m
            A matrix of n instances with m features

        y : ndarray or Series of length n
            An array or series of target or class values

        kwargs : dict
            Pass generic arguments to the drawing method

        Returns
        -------
        self : instance
            Returns the instance of the transformer/visualizer
        """
        # Determine the features, classes, and colors
        super(ParallelCoordinates, self).fit(X, y)

        # Convert from pandas data types
        if is_dataframe(X):
            X = X.values
        if is_series(y):
            y = y.values

        # Ticks for each feature specified
        self._increments = np.arange(len(self.features_))

        # Subsample instances
        X, y = self._subsample(X, y)

        # Normalize instances
        if self.normalize is not None:
            X = self.NORMALIZERS[self.normalize].fit_transform(X)

        self.draw(X, y, **kwargs)
        return self

    def draw(self, X, y, **kwargs):
        """
        Called from the fit method, this method creates the parallel
        coordinates canvas and draws each instance and vertical lines on it.

        Parameters
        ----------
        X : ndarray of shape n x m
            A matrix of n instances with m features

        y : ndarray of length n
            An array or series of target or class values

        kwargs : dict
            Pass generic arguments to the drawing method
        """
        if self.fast:
            return self.draw_classes(X, y, **kwargs)
        return self.draw_instances(X, y, **kwargs)

    def draw_instances(self, X, y, **kwargs):
        """
        Draw the instances colored by the target y such that each line is a
        single instance. This is the "slow" mode of drawing, since each
        instance has to be drawn individually. However, in so doing, the
        density of instances in braids is more apparent since lines have an
        independent alpha that is compounded in the figure.

        This is the default method of drawing.

        Parameters
        ----------
        X : ndarray of shape n x m
            A matrix of n instances with m features

        y : ndarray of length n
            An array or series of target or class values

        Notes
        -----
        This method can be used to draw additional instances onto the
        parallel coordinates before the figure is finalized.
        """
        # Get alpha from param or default
        alpha = self.alpha or 0.25

        for idx in range(len(X)):
            Xi = X[idx]
            yi = y[idx]
            color = self.get_colors([yi])[0]
            self.ax.plot(self._increments, Xi, color=color, alpha=alpha, **kwargs)

        return self.ax

    def draw_classes(self, X, y, **kwargs):
        """
        Draw the instances colored by the target y such that each line is a
        single class. This is the "fast" mode of drawing, since the number of
        lines drawn equals the number of classes, rather than the number of
        instances. However, this drawing method sacrifices inter-class
        density of points using the alpha parameter.

        Parameters
        ----------
        X : ndarray of shape n x m
            A matrix of n instances with m features

        y : ndarray of length n
            An array or series of target or class values
        """
        # Get alpha from param or default
        alpha = self.alpha or 0.5

        # Prepare to flatten data within each class:
        # introduce separation between individual data points using None in
        # x-values and arbitrary value (one) in y-values
        X_separated = np.hstack([X, np.ones((X.shape[0], 1))])
        increments_separated = self._increments.tolist()
        increments_separated.append(None)

        # Get the classes that exist in the dataset, y
        y_values = np.unique(y)

        # Plot each class as a single line plot
        for yi in y_values:
            color = self.get_colors([yi])[0]

            X_in_class = X_separated[y == yi, :]
            increments_in_class = increments_separated * len(X_in_class)
            if len(X_in_class) > 0:
                self.ax.plot(
                    increments_in_class,
                    X_in_class.flatten(),
                    linewidth=1,
                    color=color,
                    alpha=alpha,
                    **kwargs
                )

        return self.ax

    def finalize(self, **kwargs):
        """
        Finalize executes any subclass-specific axes finalization steps.
        The user calls poof and poof calls finalize.

        Parameters
        ----------
        kwargs: generic keyword arguments.
        """
        # Set the title
        self.set_title(
            "Parallel Coordinates for {} Features".format(len(self.features_))
        )

        # Add the vertical lines
        # TODO: Make an independent function for override!
        if self.show_vlines:
            for idx in self._increments:
                self.ax.axvline(idx, **self.vlines_kwds)

        # Set the limits
        self.ax.set_xticks(self._increments)
        self.ax.set_xticklabels(self.features_)
        self.ax.set_xlim(self._increments[0], self._increments[-1])

        # Add the legend sorting classes by name
        labels = sorted(list(self._colors.keys()))
        colors = [self._colors[lbl] for lbl in labels]
        manual_legend(self, labels, colors, loc="best", frameon=True)

        # Add the grid view
        self.ax.grid()

    def _subsample(self, X, y):
        # Choose a subset of samples
        if isinstance(self.sample, int):
            n_samples = min([self.sample, len(X)])
        elif isinstance(self.sample, float):
            n_samples = int(len(X) * self.sample)

        if (n_samples < len(X)) and self.shuffle:
            indices = self._rng.choice(len(X), n_samples, replace=False)
        else:
            indices = slice(n_samples)

        X = X[indices, :]
        y = y[indices]

        self.n_samples_ = n_samples
        return X, y
                                                  y, test_size=0.3, random_state=7)
for C in np.arange(0.05, 2, 0.05):
    for gamma in np.arange(0.001, 0.1, 0.001):
        svc = SVC(C=C, gamma=gamma)
        svc.fit(X_train, y_train)
        score = svc.score(X_test, y_test)
        if score > best_score:
            best_score = score
            print("C, gamma, score", C, gamma, score)

# maxabs
norm = MaxAbsScaler()
norm.fit(X)
T = norm.transform(X)
X_train, X_test, y_train, y_test = train_test_split(T, y, test_size=0.3,
                                                    random_state=7)
for C in np.arange(0.05, 2, 0.05):
    for gamma in np.arange(0.001, 0.1, 0.001):
        svc = SVC(C=C, gamma=gamma)
        svc.fit(X_train, y_train)
        score = svc.score(X_test, y_test)
        if score > best_score:
    #     'trailer',
    #     'truck_age_at_orig',
    #     'orig_amt_>150k'
], axis=1)
X = sm.add_constant(X)

#### Normalization ####
# Unit Norm
norm = Normalizer()
# X = pd.DataFrame(norm.fit_transform(X), columns=X.columns)

## Maximum absolute value
max_abs = MaxAbsScaler()
# X = pd.DataFrame(max_abs.fit_transform(X), columns=X.columns)

## MinMax Scaling
minmax = MinMaxScaler()
# X = pd.DataFrame(minmax.fit_transform(X), columns=X.columns)

Y = data['target']
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=1)
names = X_train.columns
# -*- coding: utf-8 -*-
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import BASE_COLORS
from sklearn.preprocessing import LabelEncoder, MaxAbsScaler

data = pd.read_csv('iris.csv').values
X = data[:, :-1]
# X = data[:, 2].reshape(-1, 1)
y = data[:, -1]

X = MaxAbsScaler().fit(X).transform(X)
y = LabelEncoder().fit(y).transform(y)

colormap = np.array(list(BASE_COLORS.keys()))
plt.xlabel('x')
plt.ylabel('y')
for i in range(X.shape[1]):
    for j in range(i + 1, X.shape[1]):
        plt.scatter(X[:, i], X[:, j], c=colormap[y])
plt.show()
def main():
    config_file = ""
    acceptable_parameters = {
        "shuffle": "bool",
        "learning_rate": "str",
        "penalty": "str",
        "loss": "str",
        "epsilon": "float",
        "alpha": "float",
        "power_t": "float",
        "n_iter": "int",
        "eta0": "float",
        "l1_ratio": "float",
        "random_state": "int",
        "verbose": "int",
    }
    # usedense=true
    # standardize=true
    # use_log1p=true
    arguments = sys.argv
    print("arguments: ", arguments)
    if len(arguments) != 2:
        raise Exception(" was expecting only one argument pointing to the config file... process will terminate")
    else:
        config_file = arguments[1]
        dense, standardize, use_log1p, task_type, model_file, data_file, prediction_file, column, model_parameters = \
            read_file_end_return_parameters(config_file, acceptable_parameters)

        # sanity checks
        if task_type not in ["train", "predict"]:
            raise Exception("task needs to be either train or predict, here it was %s ... " % (task_type))
        if model_file == "":
            raise Exception("model file cannot be empty")
        if data_file == "":
            raise Exception("data file cannot be empty")
        if not os.path.isfile(data_file):
            raise Exception(" %s data file does not exist... " % (data_file))
        if task_type == "predict" and prediction_file == "":
            raise Exception("prediction file cannot be empty when task=predict")
        if len(model_parameters) == 0 and task_type == "train":
            raise Exception("model parameters cannot be empty")
        if column < 1:
            raise Exception("columns cannot be less than 1...")

        if "validation_fraction" in model_parameters and model_parameters["validation_fraction"] > 0.0:
            model_parameters["early_stopping"] = True
        if "solver" in model_parameters and model_parameters["solver"] == "sgd":
            model_parameters["nesterovs_momentum"] = True

        ################### Model training ###############
        if task_type == "train":
            st = StandardScaler()
            ab = MaxAbsScaler()
            X, y = get_data(data_file, column)  # load data
            model = SGDRegressor(**model_parameters)  # set model parameters
            if dense:
                # convert to dense - useful if the data does not have high dimensionality.
                # Also sklearn models are not optimized for sparse data in tree-based algos
                X = X.toarray()
                if use_log1p:
                    X[X < 0] = 0
                    X = np.log1p(X)
                if standardize:
                    X = st.fit_transform(X)
                model.fit(X, y)  # fitting model
                joblib.dump((model, st), model_file)
            else:
                if use_log1p:
                    X[X < 0] = 0
                    X = csr_matrix(X).log1p()
                if standardize:
                    X = ab.fit_transform(X)
                model.fit(X, y)  # fitting model
                joblib.dump((model, ab), model_file)

            if not os.path.isfile(model_file):
                raise Exception(" %s model file could not be exported - check permissions ... " % (model_file))
            sys.exit(-1)  # exit script

        ################### predicting ###############
        else:
            if not os.path.isfile(model_file):
                raise Exception(" %s model file could not be imported " % (model_file))
            X, y = get_data(data_file, column)  # load data
            model, scaler = joblib.load(model_file)
            if dense:
                # convert to dense - useful if the data does not have high dimensionality.
                # Also sklearn models are not optimized for sparse data in tree-based algos
                X = X.toarray()
                if use_log1p:
                    X[X < 0] = 0
                    X = np.log1p(X)
            else:
                if use_log1p:
                    X[X < 0] = 0
                    X = csr_matrix(X).log1p()
            if standardize:
                X = scaler.transform(X)
            preds = model.predict(X)
            np.savetxt(prediction_file, preds, delimiter=",", fmt='%.9f')
            if not os.path.isfile(prediction_file):
                raise Exception(" %s prediction file could not be exported - check permissions ... " % (prediction_file))
            sys.exit(-1)  # exit script
data = data.drop(data[data['hminus_TRACK_Type'] == 5].index)

variables = [
    "V0_ENDVERTEX_Z", "hplus_P", "hplus_PT", "hminus_PT", "hminus_PZ",
    "Angle", "nLongTracks", "V0_ENDVERTEX_CHI2",
    "hplus_TRACK_GhostProb", "hminus_TRACK_CHI2NDOF"
]
# "V0_ENDVERTEX_Y", "nTracks", "V0_ORIVX_Y", "V0_ORIVX_CHI2"
# down==1 variables: "V0_ENDVERTEX_CHI2", "V0_ENDVERTEX_Z", "V0_ENDVERTEX_Y", "hplus_P", "hplus_PY",
# "hminus_P", "Angle", "nTracks", "V0_ORIVX_X", "V0_ORIVX_Z", "V0_ORIVX_CHI2", "hplus_IP_OWNPV"

# Declaring real masses from pdg
# data = data.drop(["Track_type"], axis=1)
pionm = 139.57061
protonm = 938.272081
lambdam = 1115.683

cs = MaxAbsScaler()
# norm = data.max() - data.min()
# data = data - [data.min()[0], data.min()[1], data.min()[2], data.min()[3], data.min()[4], data.min()[5], 0]
# data = data.div([norm[0], norm[1], norm[2], norm[3], norm[4], norm[5], 1])

data = data.dropna()
data = data.drop(data[data.Resolution.abs() > 10].index)

datamlp = data[variables]
labelmlp = data["Resolution"].abs()

Xtrain, Xvalid, Ytrain, Yvalid = train_test_split(data[variables], data["Resolution"].abs(), test_size=0.3)

datamlp = cs.fit_transform(datamlp)
Xtrain = cs.transform(Xtrain)
data_clients.CODE_GENDER = labelencoder.fit_transform(data_clients.CODE_GENDER)
data_clients.FLAG_OWN_CAR = labelencoder.fit_transform(data_clients.FLAG_OWN_CAR)
data_clients.FLAG_OWN_REALTY = labelencoder.fit_transform(data_clients.FLAG_OWN_REALTY)
data_clients.NAME_TYPE_SUITE = labelencoder.fit_transform(data_clients.NAME_TYPE_SUITE)
data_clients.NAME_INCOME_TYPE = labelencoder.fit_transform(data_clients.NAME_INCOME_TYPE)
data_clients.NAME_EDUCATION_TYPE = labelencoder.fit_transform(data_clients.NAME_EDUCATION_TYPE)
data_clients.NAME_FAMILY_STATUS = labelencoder.fit_transform(data_clients.NAME_FAMILY_STATUS)
data_clients.NAME_HOUSING_TYPE = labelencoder.fit_transform(data_clients.NAME_HOUSING_TYPE)
data_clients.OCCUPATION_TYPE = labelencoder.fit_transform(data_clients.OCCUPATION_TYPE)
data_clients.WEEKDAY_APPR_PROCESS_START = labelencoder.fit_transform(data_clients.WEEKDAY_APPR_PROCESS_START)
data_clients.ORGANIZATION_TYPE = labelencoder.fit_transform(data_clients.ORGANIZATION_TYPE)

scaler_maxabs = MaxAbsScaler()
data = scaler_maxabs.fit_transform(data_clients)
X_train = PCA(n_components=40).fit_transform(data)

resultat = model.predict(X_train)
dataset['TARGET_p'] = resultat.tolist()
dataset.to_csv('predict.csv')
import numpy as np
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import FunctionTransformer, MaxAbsScaler, MinMaxScaler

# NOTE: Make sure that the class is labeled 'class' in the data file
tpot_data = np.recfromcsv("PATH/TO/DATA/FILE", delimiter="COLUMN_SEPARATOR", dtype=np.float64)
features = np.delete(
    tpot_data.view(np.float64).reshape(tpot_data.size, -1),
    tpot_data.dtype.names.index("class"),
    axis=1,
)
(
    training_features,
    testing_features,
    training_classes,
    testing_classes,
) = train_test_split(features, tpot_data["class"], random_state=42)

exported_pipeline = make_pipeline(
    MaxAbsScaler(),
    MinMaxScaler(),
    LogisticRegression(C=49.0, dual=True, penalty="l2"))

exported_pipeline.fit(training_features, training_classes)
results = exported_pipeline.predict(testing_features)
import numpy as np
from sklearn.cross_validation import train_test_split
from sklearn.decomposition import RandomizedPCA
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import FunctionTransformer, MaxAbsScaler
from sklearn.svm import LinearSVC

# NOTE: Make sure that the class is labeled 'class' in the data file
tpot_data = np.recfromcsv('PATH/TO/DATA/FILE', delimiter='COLUMN_SEPARATOR', dtype=np.float64)
features = np.delete(tpot_data.view(np.float64).reshape(tpot_data.size, -1),
                     tpot_data.dtype.names.index('class'), axis=1)
training_features, testing_features, training_classes, testing_classes = \
    train_test_split(features, tpot_data['class'], random_state=42)

exported_pipeline = make_pipeline(
    make_union(
        VotingClassifier([("est", RandomForestClassifier(n_estimators=500))]),
        FunctionTransformer(lambda X: X)),
    MaxAbsScaler(),
    RandomizedPCA(iterated_power=10),
    LinearSVC(C=0.79, dual=False, penalty="l1"))

exported_pipeline.fit(training_features, training_classes)
results = exported_pipeline.predict(testing_features)
def normaliza(dados):
    p = MaxAbsScaler()
    p.fit(dados)
    return p.transform(dados)
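# Hypothetical usage of the normaliza helper above; the input array is made up
# for illustration. Each column is divided by its maximum absolute value.
import numpy as np
dados = np.array([[1.0, -2.0],
                  [0.5,  4.0]])
print(normaliza(dados))  # [[1.0, -0.5], [0.5, 1.0]]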
from sklearn.preprocessing import RobustScaler
from sklearn.utils._testing import assert_array_equal
from sklearn.utils._testing import assert_allclose

iris = load_iris()


def _get_valid_samples_by_column(X, col):
    """Get non NaN samples in column of X"""
    return X[:, [col]][~np.isnan(X[:, col])]


@pytest.mark.parametrize(
    "est, func, support_sparse, strictly_positive, omit_kwargs",
    [(MaxAbsScaler(), maxabs_scale, True, False, []),
     (MinMaxScaler(), minmax_scale, False, False, ['clip']),
     (StandardScaler(), scale, False, False, []),
     (StandardScaler(with_mean=False), scale, True, False, []),
     (PowerTransformer('yeo-johnson'), power_transform, False, False, []),
     (PowerTransformer('box-cox'), power_transform, False, True, []),
     (QuantileTransformer(n_quantiles=10), quantile_transform, True, False, []),
     (RobustScaler(), robust_scale, False, False, []),
     (RobustScaler(with_centering=False), robust_scale, True, False, [])]
)
def test_missing_value_handling(est, func, support_sparse, strictly_positive,
                                omit_kwargs):
    # check that the preprocessing method lets NaN pass through
    rng = np.random.RandomState(42)
    X = iris.data.copy()
    n_missing = 50
    X[rng.randint(X.shape[0], size=n_missing),
def randomForest(dataFrame, targetColumn, featureNames):
    dataFrame = dataFrame[featureNames]

    FEATURE_NAMES = list(dataFrame.columns)
    FEATURE_NAMES.remove(targetColumn)
    COLUMNS = list(dataFrame.columns)
    LABEL = targetColumn

    Y_dataFrame = dataFrame[[targetColumn]]
    Y_values = Y_dataFrame.values
    X_dataFrame = dataFrame.drop(targetColumn, axis=1)
    X_values = X_dataFrame.values
    Y_values = Y_values

    print(X_dataFrame.describe())

    FEATURE_DEFAULTS = ((X_dataFrame.max() + X_dataFrame.min()) * 0.5).to_dict()

    # preprocessorY = MinMaxScaler()
    # preprocessorY = StandardScaler()
    preprocessorY = MaxAbsScaler()
    preprocessorY.fit(Y_values)

    preprocessorX = MinMaxScaler()
    # preprocessorX = StandardScaler()
    preprocessorX.fit(X_values)

    Y_values = preprocessorY.transform(Y_values)
    X_values = preprocessorX.transform(X_values)

    X_numpyTrainVal, X_numpyTest, Y_numpyTrainVal, Y_numpyTest = train_test_split(X_values, Y_values, test_size=0.1)

    model = RandomForestRegressor(n_estimators=100, n_jobs=-1)
    model.fit(X_numpyTrainVal, Y_numpyTrainVal)  # training

    Y_numpyPredict = model.predict(X_numpyTest)  # prediction

    X_numpyTotal = X_values
    Y_numpyTotal = Y_values

    eps = 0.001
    Y_relErr = np.abs(Y_numpyPredict - Y_numpyTest.flatten()) / (Y_numpyTest + eps)
    for threshold in [0.025, 0.05, 0.10, 0.15]:
        bad_s = np.sum((Y_relErr > threshold))
        good_s = np.sum((Y_relErr <= threshold))
        total_s = Y_relErr.size
        print("threshold = {:5}, good = {:10}, bad = {:10}, err = {:4}".format(
            threshold, good_s / total_s, bad_s / total_s, bad_s / (good_s + bad_s)))

    Y_numpyPredict = preprocessorY.inverse_transform(Y_numpyPredict.reshape(-1, 1))
    Y_numpyTest = preprocessorY.inverse_transform(Y_numpyTest.reshape(-1, 1))

    modelPacket = dict()
    modelPacket['model'] = model
    modelPacket['preprocessorX'] = preprocessorX
    modelPacket['preprocessorY'] = preprocessorY
    modelPacket['feature_names'] = FEATURE_NAMES
    modelPacket['feature_defaults'] = FEATURE_DEFAULTS

    threshold = 10
    print()
    Y_relativeError = np.abs(Y_numpyPredict - Y_numpyTest) * 100 / Y_numpyTest

    allValues = Y_numpyTest
    mask = Y_relativeError > threshold
    badValues = Y_numpyTest[mask]
    mask = Y_relativeError <= threshold
    goodValues = Y_numpyTest[mask]

    bins = range(1, 20)
    bins = [i * 0.5e6 for i in bins]

    figure, axes = plt.subplots(3, 1)
    axes[1].axis('tight')
    axes[1].axis('off')

    resultValues = axes[0].hist([allValues, goodValues, badValues], bins=bins,
                                histtype='bar', color=['green', 'yellow', 'red'])
    allValues = resultValues[0][0]
    goodValues = resultValues[0][1]
    badValues = resultValues[0][2]
    accuracy = goodValues * 100 / (allValues + 0.01)

    col_label = ['{:5d}'.format(int((bins[i + 0] + bins[i + 1]) / 2)) for i in range(len(bins) - 1)]
    cell_text = [['{:2.1f}'.format(acc_) for acc_ in accuracy], ]

    table_ = axes[1].table(cellText=cell_text, colLabels=col_label, loc='center')
    table_.auto_set_font_size(False)
    table_.set_fontsize(8)

    Y_numpyTest_max = np.max(Y_numpyTest)
    Y_numpyTest_min = np.min(Y_numpyTest)
    # axes[2].set_position([Y_numpyTotal_min-Y_numpyTotal_width*0.1, Y_numpyTotal_min-Y_numpyTotal_width*0.1,
    #                       Y_numpyTotal_width*0.2, Y_numpyTotal_width*0.2])
    axes[2].plot(Y_numpyTest, Y_numpyTest, c='blue')
    axes[2].plot(Y_numpyTest, Y_numpyTest * (1.0 + 0.1), c='red')
    axes[2].plot(Y_numpyTest, Y_numpyTest * (1.0 - 0.1), c='red')
    axes[2].scatter(Y_numpyPredict, Y_numpyTest)

    plt.show()

    # figure, axes = plt.subplots(3, 1)
    # clust_data = np.random.random((10, 3))
    # collabel = ("col 1", "col 2", "col 3")
    # axs[0].axis('tight')
    # axs[0].axis('off')
    # the_table = axs[0].table(cellText=clust_data, colLabels=collabel, loc='center')
    # axs[1].plot(clust_data[:, 0], clust_data[:, 1])
    # plt.show()

    return modelPacket, (Y_numpyPredict, Y_numpyTotal)
import numpy as np
import pandas as pd
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import Matern
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MaxAbsScaler, RobustScaler
from tpot.export_utils import set_param_recursive

# NOTE: Make sure that the outcome column is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)
features = tpot_data.drop('target', axis=1)
training_features, testing_features, training_target, testing_target = \
    train_test_split(features, tpot_data['target'], random_state=123)

# Average CV score on the training set was: 0.9604589764296974
exported_pipeline = make_pipeline(
    RobustScaler(),
    MaxAbsScaler(),
    GaussianProcessRegressor(kernel=Matern(length_scale=4.0, nu=2.5),
                             n_restarts_optimizer=185, normalize_y=False))
# Fix random state for all the steps in exported pipeline
set_param_recursive(exported_pipeline.steps, 'random_state', 123)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
def train_ann(X_array, Y_array, scaling_input=None, scaling_output=None,
              reduce_pca=False, outfile=None,
              regressor_opts={'activation': 'logistic'}):

    print('Fitting Artificial Neural Network')

    Y_array = np.asarray(Y_array)
    X_array = np.asarray(X_array)

    pca = None
    input_scaler = None
    output_scaler = None

    # Normalize the input
    if scaling_input == 'minmax':
        input_scaler = MinMaxScaler()
        X_array = input_scaler.fit_transform(X_array)
        if outfile:
            fid = open(outfile + '_scaler_input', 'wb')
            pickle.dump(input_scaler, fid, -1)
            fid.close()
    elif scaling_input == 'maxabs':
        input_scaler = MaxAbsScaler()
        X_array = input_scaler.fit_transform(X_array)
        if outfile:
            fid = open(outfile + '_scaler_input', 'wb')
            pickle.dump(input_scaler, fid, -1)
            fid.close()
    elif scaling_input == 'normalize':
        input_scaler = StandardScaler()
        X_array = input_scaler.fit_transform(X_array)
        if outfile:
            fid = open(outfile + '_scaler_input', 'wb')
            pickle.dump(input_scaler, fid, -1)
            fid.close()

    if reduce_pca:
        # Reduce input variables using Principal Component Analysis
        pca = PCA(n_components=10)
        X_array = pca.fit_transform(X_array)
        if outfile:
            fid = open(outfile + '_PCA', 'wb')
            pickle.dump(pca, fid, -1)
            fid.close()

    # Normalize the output
    if scaling_output == 'minmax':
        output_scaler = MinMaxScaler()
        Y_array = output_scaler.fit_transform(Y_array)
        if outfile:
            fid = open(outfile + '_scaler_output', 'wb')
            pickle.dump(output_scaler, fid, -1)
            fid.close()
    elif scaling_output == 'maxabs':
        output_scaler = MaxAbsScaler()
        Y_array = output_scaler.fit_transform(Y_array)
        if outfile:
            fid = open(outfile + '_scaler_output', 'wb')
            pickle.dump(output_scaler, fid, -1)
            fid.close()
    elif scaling_output == 'normalize':
        output_scaler = StandardScaler()
        Y_array = output_scaler.fit_transform(Y_array)
        if outfile:
            fid = open(outfile + '_scaler_output', 'wb')
            pickle.dump(output_scaler, fid, -1)
            fid.close()

    # Get the number of bands to set the ANN structure
    ann = ann_sklearn.MLPRegressor(**regressor_opts)
    ANN = ann.fit(X_array, Y_array)

    if outfile:
        fid = open(outfile, 'wb')
        pickle.dump(ANN, fid, -1)
        fid.close()

    return ANN, input_scaler, output_scaler, pca
import numpy as np
import pandas as pd
from sklearn.feature_selection import SelectPercentile, VarianceThreshold, f_regression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import MaxAbsScaler, PolynomialFeatures
from sklearn.svm import LinearSVR
from tpot.builtins import StackingEstimator
from sklearn.preprocessing import FunctionTransformer
from copy import copy

# NOTE: Make sure that the outcome column is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)
features = tpot_data.drop('target', axis=1)
training_features, testing_features, training_target, testing_target = \
    train_test_split(features, tpot_data['target'], random_state=None)

# Average CV score on the training set was: -3.6334721147216316
exported_pipeline = make_pipeline(
    make_union(FunctionTransformer(copy), FunctionTransformer(copy)),
    SelectPercentile(score_func=f_regression, percentile=89),
    VarianceThreshold(threshold=0.1),
    MaxAbsScaler(),
    PolynomialFeatures(degree=2, include_bias=False, interaction_only=False),
    LinearSVR(C=1.0, dual=True, epsilon=1.0, loss="epsilon_insensitive", tol=0.0001))

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
from sklearn.feature_selection import SelectFwe, SelectKBest, SelectPercentile, VarianceThreshold
from sklearn.feature_selection import SelectFromModel, RFE
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import accuracy_score, f1_score
from tpot_metrics import balanced_accuracy_score
from sklearn.pipeline import make_pipeline
import itertools
import sys
from sklearn.preprocessing import (Binarizer, MaxAbsScaler, MinMaxScaler, Normalizer,
                                   PolynomialFeatures, RobustScaler, StandardScaler)
from sklearn.decomposition import FastICA, PCA
from sklearn.kernel_approximation import RBFSampler, Nystroem
from sklearn.cluster import FeatureAgglomeration

dataset = sys.argv[1]

preprocessor_list = [
    Binarizer(),
    MaxAbsScaler(),
    MinMaxScaler(),
    Normalizer(),
    PolynomialFeatures(),
    RobustScaler(),
    StandardScaler(),
    FastICA(),
    PCA(),
    RBFSampler(),
    Nystroem(),
    FeatureAgglomeration(),
    SelectFwe(),
    SelectKBest(),
    SelectPercentile(),
    VarianceThreshold(),
    SelectFromModel(estimator=ExtraTreesClassifier(n_estimators=100)),
                             verbosity=0)),
    MinMaxScaler(),
    MinMaxScaler(),
    StackingEstimator(estimator=SGDRegressor(alpha=0.01, eta0=0.01, fit_intercept=False,
                                             l1_ratio=0.0, learning_rate="constant",
                                             loss="huber", penalty="elasticnet",
                                             power_t=0.0)),
    StackingEstimator(estimator=LinearSVR(C=25.0, dual=True, epsilon=0.01,
                                          loss="epsilon_insensitive", tol=0.0001)),
    FeatureAgglomeration(affinity="l2", linkage="average"),
    MaxAbsScaler(),
    SelectPercentile(score_func=f_regression, percentile=6),
    StackingEstimator(
        estimator=GradientBoostingRegressor(alpha=0.9, learning_rate=0.1, loss="huber",
                                            max_depth=3, max_features=0.1,
                                            min_samples_leaf=13, min_samples_split=11,
                                            n_estimators=10,
                                            subsample=0.7000000000000001)),
    StackingEstimator(estimator=LinearSVR(C=20.0, dual=True, epsilon=1.0,
                                          loss="squared_epsilon_insensitive",
# Import functional utilities
from sklearn.preprocessing import FunctionTransformer, MaxAbsScaler
from sklearn.pipeline import FeatureUnion

# Perform preprocessing
get_text_data = FunctionTransformer(combine_text_columns, validate=False)
get_numeric_data = FunctionTransformer(lambda x: x[NUMERIC_COLUMNS], validate=False)

# Create the token pattern: TOKENS_ALPHANUMERIC
TOKENS_ALPHANUMERIC = '[A-Za-z0-9]+(?=\\s+)'

# Instantiate pipeline: pl
pl = Pipeline([
    ('union', FeatureUnion(
        transformer_list=[
            ('numeric_features', Pipeline([
                ('selector', get_numeric_data),
                ('imputer', Imputer())
            ])),
            ('text_features', Pipeline([
                ('selector', get_text_data),
                ('vectorizer', CountVectorizer(token_pattern=TOKENS_ALPHANUMERIC,
                                               ngram_range=(1, 2))),
                ('dim_red', SelectKBest(chi2, chi_k))
            ]))
        ]
    )),
    ('scale', MaxAbsScaler()),
    ('clf', OneVsRestClassifier(LogisticRegression()))
])
training_end = time.perf_counter()
prediction_start = time.perf_counter()
preds = knn.predict(X_test)
prediction_end = time.perf_counter()
acc_knn = (preds == y_test).sum().astype(float) / len(preds) * 100
knn_train_time = training_end - training_start
knn_prediction_time = prediction_end - prediction_start
print("K Nearest Neighbors Classifier's prediction accuracy is: %3.2f" % (acc_knn))
print("Time consumed for training: %4.3f seconds" % (knn_train_time))
print("Time consumed for prediction: %6.5f seconds" % (knn_prediction_time))

### Naive Bayes
from sklearn.preprocessing import MaxAbsScaler

scaler_gnb = MaxAbsScaler()
sdss = scaler_gnb.fit_transform(sdss_df_fe.drop('Category_list', axis=1))
X_train_gnb, X_test_gnb, y_train_gnb, y_test_gnb = train_test_split(
    sdss, sdss_df_fe['Category_list'], test_size=0.33)

gnb = GaussianNB()
training_start = time.perf_counter()
gnb.fit(X_train_gnb, y_train_gnb)
training_end = time.perf_counter()
prediction_start = time.perf_counter()
preds = gnb.predict(X_test_gnb)
prediction_end = time.perf_counter()
acc_gnb = (preds == y_test_gnb).sum().astype(float) / len(preds) * 100
gnb_train_time = training_end - training_start
gnb_prediction_time = prediction_end - prediction_start
print("Gaussian Naive Bayes Classifier's prediction accuracy is: %3.2f" %
def maxabsscaler(data):
    data = MaxAbsScaler().fit_transform(data)
    return data
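# Sketch (toy data invented for illustration): MaxAbsScaler also supports
# inverse_transform, so a helper like the one above could return the fitted
# scaler as well when the original scale has to be recovered later.
import numpy as np
from sklearn.preprocessing import MaxAbsScaler

original = np.array([[2.0, -4.0], [1.0, 8.0]])
scaler = MaxAbsScaler().fit(original)
restored = scaler.inverse_transform(scaler.transform(original))
assert np.allclose(restored, original)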
}

# Take only 2 features to make visualization easier
# Feature MedInc has a long tail distribution.
# Feature AveOccup has a few but very large outliers.
features = ['MedInc', 'AveOccup']
features_idx = [feature_names.index(feature) for feature in features]
X = X_full[:, features_idx]

distributions = [
    ('Unscaled data', X),
    ('Data after standard scaling',
        StandardScaler().fit_transform(X)),
    ('Data after min-max scaling',
        MinMaxScaler().fit_transform(X)),
    ('Data after max-abs scaling',
        MaxAbsScaler().fit_transform(X)),
    ('Data after robust scaling',
        RobustScaler(quantile_range=(25, 75)).fit_transform(X)),
    ('Data after power transformation (Yeo-Johnson)',
        PowerTransformer(method='yeo-johnson').fit_transform(X)),
    ('Data after power transformation (Box-Cox)',
        PowerTransformer(method='box-cox').fit_transform(X)),
    ('Data after quantile transformation (uniform pdf)',
        QuantileTransformer(output_distribution='uniform').fit_transform(X)),
    ('Data after quantile transformation (gaussian pdf)',
        QuantileTransformer(output_distribution='normal').fit_transform(X)),
    ('Data after sample-wise L2 normalizing',
        Normalizer().fit_transform(X)),
]
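# Sketch showing what the max-abs entry in the list above computes: the scaling is
# equivalent to dividing each column by its maximum absolute value (assuming a
# dense 2-D float array; the demo array is invented for illustration).
import numpy as np
from sklearn.preprocessing import MaxAbsScaler

X_demo = np.array([[1.0, -0.5], [2.0, 0.25]])
manual = X_demo / np.abs(X_demo).max(axis=0)
assert np.allclose(manual, MaxAbsScaler().fit_transform(X_demo))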
@author: damian.campo
"""

from sklearn import datasets
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
from sklearn.preprocessing import MaxAbsScaler   # See Normalizer and StandardScaler

# Load dataset
iris = datasets.load_iris()
samples = iris.data
# print(samples)

# Scale each feature by its maximum absolute value; since the iris features
# are non-negative, all values end up between 0 and 1
scaler = MaxAbsScaler()
scaler.fit(samples)
samples = scaler.transform(samples)

model = KMeans(n_clusters=3)
model.fit(samples)
labels = model.predict(samples)
target = iris.target

plt.figure()
# For plotting purposes only features 2 and 3 are taken into consideration
plt.scatter(samples[:, 2], samples[:, 3], c=target)
plt.title('Ground truth clusters (Based on 2 features)')

centroids = model.cluster_centers_
centroids_x = centroids[:, 2]
                          on='userId', how='left')
columns = test_data.columns.tolist()
df = test_data.merge(user_feature, on='userId', how='left')
df1 = df.merge(offline_user_mer_feature, on=['userId', 'merchantId'], how='left')
df2 = df1.merge(offline_mer_feature, on='merchantId', how='left')
df2.fillna(np.nan, inplace=True)
df2.drop(columns, axis=1, inplace=True)
df2.to_csv('resource/train_features.csv', index=False)
# train_features = pd.read_csv(train_feature_path).astype(float)

columns = df2.columns.tolist()
for col in columns:
    if col == 'userAverageDistance_y' or col == 'userAverageDistance_x':
        df2[col] = df2[col].fillna(-1)
    else:
        df2[col] = df2[col].fillna(0)

max_abs_scaler = MaxAbsScaler()
x_test_maxabs = max_abs_scaler.fit_transform(df2)

clf = joblib.load('model/xgb/xgb_model.pkl')
test_matrix = xgboost.DMatrix(df2.values, feature_names=df2.columns)
y_pre = clf.predict(test_matrix)
y_pre_df = pd.DataFrame(pd.Series(y_pre), columns=['Probability'])

submit_df = test_data[['userId', 'Coupon_id', 'Date_received']].join(y_pre_df)
submit_df = submit_df.rename(columns={'userId': 'User_id'})
submit_df.to_csv('submit/submit.csv', index=False)
from sklearn.preprocessing import RobustScaler
from sklearn.utils._testing import assert_array_equal
from sklearn.utils._testing import assert_allclose

iris = load_iris()


def _get_valid_samples_by_column(X, col):
    """Get non NaN samples in column of X"""
    return X[:, [col]][~np.isnan(X[:, col])]


@pytest.mark.parametrize(
    "est, func, support_sparse, strictly_positive",
    [(MaxAbsScaler(), maxabs_scale, True, False),
     (MinMaxScaler(), minmax_scale, False, False),
     (StandardScaler(), scale, False, False),
     (StandardScaler(with_mean=False), scale, True, False),
     (PowerTransformer('yeo-johnson'), power_transform, False, False),
     (PowerTransformer('box-cox'), power_transform, False, True),
     (QuantileTransformer(n_quantiles=10), quantile_transform, True, False),
     (RobustScaler(), robust_scale, False, False),
     (RobustScaler(with_centering=False), robust_scale, True, False)]
)
def test_missing_value_handling(est, func, support_sparse, strictly_positive):
    # check that the preprocessing method lets NaN pass through
    rng = np.random.RandomState(42)
    X = iris.data.copy()
    n_missing = 50
    X[rng.randint(X.shape[0], size=n_missing),
      rng.randint(X.shape[1], size=n_missing)] = np.nan
import numpy as np
import pandas as pd
from sklearn.feature_selection import SelectFwe, f_regression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import MaxAbsScaler, RobustScaler
from tpot.builtins import StackingEstimator, ZeroCount
from xgboost import XGBRegressor
from sklearn.preprocessing import FunctionTransformer
from copy import copy

# NOTE: Make sure that the class is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)
features = tpot_data.drop('target', axis=1).values
training_features, testing_features, training_target, testing_target = \
    train_test_split(features, tpot_data['target'].values, random_state=42)

# Average CV score on the training set was: -148.31589276097782
exported_pipeline = make_pipeline(
    make_union(
        make_pipeline(MaxAbsScaler(), RobustScaler(), ZeroCount(),
                      SelectFwe(score_func=f_regression, alpha=0.038)),
        FunctionTransformer(copy)),
    XGBRegressor(learning_rate=0.1, max_depth=9, min_child_weight=15,
                 n_estimators=100, nthread=1, subsample=1.0))

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)