def get_data():
    df_train = pd.read_csv('data_original/aps_failure_training_set.csv')
    df_test = pd.read_csv('data_original/aps_failure_test_set.csv')
    # replace the 'na' missing-value markers in both splits so that the
    # later astype('float64') does not fail on strings
    df_train.replace('na', '-1', inplace=True)
    df_test.replace('na', '-1', inplace=True)
    # categorical for label: 0: neg, 1: pos
    df_train['class'] = pd.Categorical(df_train['class']).codes
    df_test['class'] = pd.Categorical(df_test['class']).codes
    # split data into x and y
    Y_train = df_train['class'].copy(deep=True)
    X_train = df_train.copy(deep=True)
    X_train.drop(['class'], inplace=True, axis=1)
    Y_test = df_test['class'].copy(deep=True)
    X_test = df_test.copy(deep=True)
    X_test.drop(['class'], inplace=True, axis=1)
    # strings to float
    X_train = X_train.astype('float64')
    X_test = X_test.astype('float64')
    # scale the dataset: fit on train only, then apply to both splits
    scaler = MaxAbsScaler()
    scaler.fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)
    return X_train, Y_train, X_test, Y_test
def cv(model, x, y):
    # assumes x and y are numpy arrays; scalers require 2-D input,
    # hence the reshape of the 1-D target
    errors = []
    kf = KFold(n_splits=10, shuffle=True)
    for train_index, test_index in kf.split(x):
        x_train, x_test = x[train_index], x[test_index]
        y_train, y_test = y[train_index], y[test_index]
        x_scaler = MaxAbsScaler()
        y_scaler = MaxAbsScaler()
        x_scaler.fit(x_train)
        y_scaler.fit(y_train.reshape(-1, 1))
        xx_train = x_scaler.transform(x_train)
        xx_test = x_scaler.transform(x_test)
        yy_train = y_scaler.transform(y_train.reshape(-1, 1))
        yy_test = y_scaler.transform(y_test.reshape(-1, 1))
        cv_model = sklearn.base.clone(model)
        cv_model.fit(xx_train, yy_train.ravel())
        yy_predicted = cv_model.predict(xx_test)
        error = math.sqrt(mean_squared_error(yy_test, yy_predicted))
        errors.append(error)
    return errors
def scale(self, X_train, X_test, type):
    if type == "StandardScaler":
        scaler = StandardScaler()
    elif type == "MinMaxScaler":
        scaler = MinMaxScaler()
    elif type == "MaxScaler":
        scaler = MaxAbsScaler()
    elif type == "RobustScaler":
        scaler = RobustScaler()
    else:
        raise ValueError("unknown scaler type: %s" % type)
    scaler.fit(X_train)
    return scaler.transform(X_train), scaler.transform(X_test)
def lasso(input_df, output, plot=False, normalize=False):
    print('LASSO #################')
    if normalize:
        transformer = MaxAbsScaler().fit(input_df.values)
        X = transformer.transform(input_df.values)
        transformer = MaxAbsScaler().fit(output.values.reshape(-1, 1))
        y = transformer.transform(output.values.reshape(-1, 1))
    else:
        X = input_df.values
        y = output.values.reshape(-1, 1)
    # defaults: tol=1e-4, max_iter=1000
    reg = LassoCV(max_iter=8000, tol=0.1)
    reg.fit(X, y.ravel())  # LassoCV expects a 1-D target
    coef = pd.Series(reg.coef_, index=input_df.columns)
    imp_coef = coef.sort_values()
    if plot:
        matplotlib.rcParams['figure.figsize'] = (8.0, 10.0)
        imp_coef.plot(kind="barh")
        plt.title("Feature importance using Lasso Model")
        plt.show()
    return coef
def normalization_a():
    from sklearn.preprocessing import MinMaxScaler, MaxAbsScaler
    # interval scaling: MinMaxScaler maps the data into the [0, 1] range
    min_max_scaler = MinMaxScaler().fit(data)
    data_min_max = min_max_scaler.transform(data)
    # MaxAbsScaler divides each feature by its maximum absolute value,
    # mapping the data into [-1, 1]
    max_abs_scaler = MaxAbsScaler().fit(data)
    data_max_abs = max_abs_scaler.transform(data)
    return data_min_max, data_max_abs
def load_dataset(dataset):
    X_tr, y_tr, X_te, y_te = LOADERS[dataset]()
    # fit the scaler on the training split only and reuse it for the test
    # split; fitting a second scaler on X_te would leak test statistics
    transformer = MaxAbsScaler().fit(X_tr)
    X_tr = transformer.transform(X_tr)
    if X_te is not None:
        X_te = transformer.transform(X_te)
    return X_tr, y_tr, X_te, y_te
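# A small illustration (made-up numbers, not from the original loaders) of
# why load_dataset must reuse the train-fitted scaler: fitting a fresh
# MaxAbsScaler on the test split maps the same raw value to a different
# scaled value, so train and test would no longer be comparable.
import numpy as np
from sklearn.preprocessing import MaxAbsScaler

X_tr = np.array([[10.], [-20.]])
X_te = np.array([[5.]])
shared = MaxAbsScaler().fit(X_tr)
leaky = MaxAbsScaler().fit(X_te)
print(shared.transform(X_te))  # [[0.25]] -- on the training scale
print(leaky.transform(X_te))   # [[1.0]]  -- a test-only, inconsistent scale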
def maxAbsScaler(train_x, test_x):
    scaler = MaxAbsScaler()
    scaler.fit(train_x.data)
    train_x.data = scaler.transform(train_x.data)
    if test_x is not None:
        test_x.data = scaler.transform(test_x.data)
    print(pd.DataFrame(train_x.data).describe())
    return train_x.data
def run_classification(data, labels, test_idx, trees, c):
    all_scores = []
    labels = np.asarray(labels)
    length = len(data[0])
    for fold_data, test_idx_fold in zip(data, test_idx):
        train_idx_fold = [idx for idx in range(length)
                          if idx not in test_idx_fold]
        fold_data = np.array(fold_data)
        test_idx_fold = np.array(test_idx_fold)
        train_idx_fold = np.array(train_idx_fold)
        X_train, X_test = fold_data[train_idx_fold], fold_data[test_idx_fold]
        # targets come from the labels argument, indexed by the fold splits
        y_train, y_test = labels[train_idx_fold], labels[test_idx_fold]
        max_abs_scaler = MaxAbsScaler()
        max_abs_scaler.fit(X_train)
        X_train_maxabs = max_abs_scaler.transform(X_train)
        X_test_maxabs = max_abs_scaler.transform(X_test)
        rf = RandomForestClassifier(n_estimators=trees, n_jobs=6, criterion=c,
                                    class_weight="balanced", random_state=1357)
        rf.fit(X_train_maxabs, y_train)
        # RandomForestClassifier has no decision_function; fall back to the
        # positive-class probability
        try:
            scores_testing = rf.decision_function(X_test_maxabs)
        except AttributeError:
            scores_testing = rf.predict_proba(X_test_maxabs)[:, 1]
        all_scores.append(scores_testing)
        rf_fpr, rf_tpr, rf_thr = roc_curve(y_test, scores_testing)
        auc_val = auc(rf_fpr, rf_tpr)
    return all_scores
def normalize(train, test, val):
    start_time = datetime.datetime.now()
    scaler = MaxAbsScaler()
    # alternatives: MinMaxScaler(), StandardScaler()
    X_train = scaler.fit_transform(train)
    X_val = scaler.transform(val)
    X_test = scaler.transform(test)
    end_time = datetime.datetime.now()
    print("normalization time taken - {}".format(end_time - start_time))
    return X_train, X_test, X_val
def MaxAbs_Scaler(X_train, X_test):
    """
    Max-abs scaling: scale each feature by its maximum absolute value.

    :param X_train: array-like training data
    :param X_test: array-like test data
    :return: scaled training data, scaled test data, and the fitted scaler
    """
    scaler = MaxAbsScaler()
    scaler.fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)
    return X_train, X_test, scaler
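# A minimal usage sketch of the helper above (the toy arrays are
# illustrative, not from the original code): the returned scaler is what
# lets callers undo the transform later via inverse_transform.
import numpy as np
from sklearn.preprocessing import MaxAbsScaler

X_tr = np.array([[1.0, -2.0], [3.0, 4.0]])
X_te = np.array([[2.0, 2.0]])
X_tr_s, X_te_s, fitted = MaxAbs_Scaler(X_tr, X_te)
assert np.allclose(fitted.inverse_transform(X_tr_s), X_tr)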
def convert_to_timeseries(X_train, X_test, time_step, feature_num, scale=True):
    if scale:
        transformer = MaxAbsScaler().fit(X_train)
        X_train = transformer.transform(X_train)
        X_test = transformer.transform(X_test)
    # reshape each row into a (feature_num, time_step) matrix, then swap the
    # last two axes so every sample is (time_step, feature_num)
    X_train = X_train.reshape(X_train.shape[0], feature_num, time_step)
    X_train = X_train.transpose(0, 2, 1)
    X_test = X_test.reshape(X_test.shape[0], feature_num, time_step)
    X_test = X_test.transpose(0, 2, 1)
    return X_train, X_test
def multiple_model_runner(df):
    X_train, X_test, y_train, y_test = preprocess(
        df, min_df=100, ngram_range=(1, 1), penalty="l2")  # **config_build_models
    scaling = MaxAbsScaler().fit(X_train)
    X_train_scaled = scaling.transform(X_train)
    X_test_scaled = scaling.transform(X_test)
    # Append a classifier to the preprocessing pipeline for six kinds of models.
    # Leave all with default parameters for now - hyperparameters are fit for
    # the best model afterwards.
    log_reg = Pipeline(steps=[('classifier', LogisticRegression(class_weight='balanced'))])
    clf = Pipeline(steps=[('classifier', DecisionTreeClassifier(class_weight='balanced'))])
    rd = Pipeline(steps=[('classifier', RandomForestClassifier(class_weight='balanced'))])
    svm_class = Pipeline(steps=[('classifier', svm.LinearSVC(class_weight='balanced'))])
    knn = Pipeline(steps=[('classifier', KNeighborsClassifier(n_neighbors=4))])
    NN = Pipeline(steps=[('classifier', MLPClassifier(solver='lbfgs', alpha=1e-5,
                                                      hidden_layer_sizes=(5, 2),
                                                      random_state=1))])
    # Test which model generally performs the best with default parameters
    model_list = [log_reg, clf, rd, svm_class, knn, NN]
    model_names = ['Logistic Regression', 'Decision Tree Classifier',
                   'Random Forest Classifier', 'SVM', 'KNN', 'MLPC']
    accuracies = []
    f1s = []
    for name, model in zip(model_names, model_list):
        model.fit(X_train_scaled, y_train)
        preds = model.predict(X_test_scaled)
        accuracies.append(model.score(X_test_scaled, y_test))
        f1s.append(f1_score(y_test, preds, average='macro'))
        print(name)
        print(confusion_matrix(y_test, preds))
    comparison_table(model_names, accuracies, f1s)
def scaler_dummy(dataset, dataset_test):
    scaler_mm = MinMaxScaler()
    scaler_ma = MaxAbsScaler()
    scaler_sd = StandardScaler()
    scaler_rb = RobustScaler()
    numerical = list(dataset.columns)
    data_transform_mm = pd.DataFrame(data=dataset)
    data_transform_ma = pd.DataFrame(data=dataset)
    data_transform_sd = pd.DataFrame(data=dataset)
    data_transform_rb = pd.DataFrame(data=dataset)
    # fit on the training data and transform it in one step
    data_transform_mm[numerical] = scaler_mm.fit_transform(dataset[numerical])
    data_transform_ma[numerical] = scaler_ma.fit_transform(dataset[numerical])
    data_transform_sd[numerical] = scaler_sd.fit_transform(dataset[numerical])
    data_transform_rb[numerical] = scaler_rb.fit_transform(dataset[numerical])
    ## get dummies
    features_final_mm = pd.get_dummies(data_transform_mm)
    features_final_ma = pd.get_dummies(data_transform_ma)
    features_final_sd = pd.get_dummies(data_transform_sd)
    features_final_rb = pd.get_dummies(data_transform_rb)
    # apply the scalers fitted on the training data to the test data
    numerical = list(dataset_test.columns)
    scaler_mm_fitted_test = pd.DataFrame(data=scaler_mm.transform(dataset_test[numerical]), columns=numerical)
    scaler_ma_fitted_test = pd.DataFrame(data=scaler_ma.transform(dataset_test[numerical]), columns=numerical)
    scaler_sd_fitted_test = pd.DataFrame(data=scaler_sd.transform(dataset_test[numerical]), columns=numerical)
    scaler_rb_fitted_test = pd.DataFrame(data=scaler_rb.transform(dataset_test[numerical]), columns=numerical)
    features_final_mmt = pd.get_dummies(scaler_mm_fitted_test)
    features_final_mat = pd.get_dummies(scaler_ma_fitted_test)
    features_final_sdt = pd.get_dummies(scaler_sd_fitted_test)
    features_final_rbt = pd.get_dummies(scaler_rb_fitted_test)
    return (features_final_mm, features_final_ma, features_final_sd, features_final_rb,
            features_final_mmt, features_final_mat, features_final_sdt, features_final_rbt)
def run_train_test():
    print('reading data...')
    with np.load('data/data.npz') as data:
        x_all, y_all = data['x'], data['y']
    print('transforming data...')
    y_all[y_all < 0] = 0
    # split train and test
    num_train = int(len(y_all) * 0.8)
    x_train, y_train = x_all[:num_train], y_all[:num_train]
    x_test, y_test = x_all[num_train:], y_all[num_train:]
    perm = np.random.permutation(num_train)
    x_train, y_train = x_train[perm], y_train[perm]
    # feature scaling
    scaler = MaxAbsScaler().fit(x_train)
    x_train = scaler.transform(x_train)
    x_test = scaler.transform(x_test)
    # to DMatrix
    dtrain = xgb.DMatrix(x_train, label=y_train)
    dtest = xgb.DMatrix(x_test, label=y_test)
    print('fitting xgb...')
    param = dict(
        objective='binary:logistic',
        eval_metric=['error', 'auc'],
        max_depth=7,
        eta=0.3,
        # scale_pos_weight=np.sum(y_train == 0) / np.sum(y_train == 1),
    )
    print(param)
    bst = xgb.train(param, dtrain, num_boost_round=10,
                    evals=[(dtrain, 'train'), (dtest, 'test')])
    bst.save_model('data/xgb.model')
    print('validating...')
    bst = xgb.Booster()
    bst.load_model('data/xgb.model')
    y_pred_proba = bst.predict(dtest)
    y_pred = (y_pred_proba > 0.5).astype(int)
    print('acc:', accuracy_score(y_test, y_pred))
    print('auc:', roc_auc_score(y_test, y_pred_proba))
def df_maxabsscale(df):
    from sklearn.preprocessing import MaxAbsScaler
    maxabs_scaler = MaxAbsScaler().fit(df)
    df = pd.DataFrame(maxabs_scaler.transform(df), columns=df.columns)
    print("DataSet MaxAbsScaled...")
    print(df.head())  # a bare df.head() has no effect outside a notebook
    return df
def normalize_data(dataframe, mode):
    if mode == 'abs':
        from sklearn.preprocessing import MaxAbsScaler
        max_abs = MaxAbsScaler(copy=True)  # keep the scaler for the inverse transform later
        max_abs.fit(dataframe)
        return max_abs.transform(dataframe), max_abs
    if mode == 'robust':
        from sklearn.preprocessing import RobustScaler
        robust = RobustScaler(copy=True)  # keep the scaler for the inverse transform later
        robust.fit(dataframe)
        return robust.transform(dataframe), robust
    if mode == 'min_max':
        from sklearn.preprocessing import MinMaxScaler
        minmax = MinMaxScaler(feature_range=(0, 1), copy=True)  # keep the scaler for the inverse transform later
        minmax.fit(dataframe)
        return minmax.transform(dataframe), minmax
    if mode == 'std':
        from sklearn.preprocessing import StandardScaler
        stdscaler = StandardScaler(copy=True, with_mean=True, with_std=True)
        stdscaler.fit(dataframe)
        return stdscaler.transform(dataframe), stdscaler
    raise ValueError("mode must be one of 'abs', 'robust', 'min_max', 'std'")
def scikit_clustering(number_of_clusters=3600):
    with open(os.path.join(ROOT_DIR, 'users_feature.p'), 'rb') as fh:
        user_features = pickle.load(fh)
    users_features_vectors = list(user_features.values())
    users_dataset = np.array(users_features_vectors)
    df = pd.DataFrame(users_dataset)
    # columns 0, 1, 3 and 6 hold categorical features
    for col in (0, 1, 3, 6):
        df[col] = df[col].astype('category')
    # scale the numeric columns into [-1, 1]
    abs_scaler = MaxAbsScaler()
    abs_scaler.fit(df[[2, 4, 5, 7, 8, 9]])
    df[[2, 4, 5, 7, 8, 9]] = abs_scaler.transform(df[[2, 4, 5, 7, 8, 9]])
    print(df.iloc[:, [0]].dtypes[0])
    clustering = AgglomerativeClustering(n_clusters=number_of_clusters,
                                         affinity=gower.gower_matrix,
                                         linkage='complete').fit(df)
    result = clustering.labels_
    clustering_result = {}
    for i, label in enumerate(result):
        clustering_result.setdefault(label, []).append(users_features_vectors[i])
    with open('users_vectors_clustering.p', 'wb') as file_to_write:
        pickle.dump(clustering_result, file_to_write)
def scikit_clustering_ver2(number_of_clusters=3600):
    with open(os.path.join(ROOT_DIR, 'users_feature.p'), 'rb') as fh:
        user_features = pickle.load(fh)
    users_features_vectors = list(user_features.values())
    users_dataset = np.array(users_features_vectors)
    df = pd.DataFrame(users_dataset)
    # columns 0, 1, 3 and 6 hold categorical features
    for col in (0, 1, 3, 6):
        df[col] = df[col].astype('category')
    abs_scaler = MaxAbsScaler()
    abs_scaler.fit(df[[2, 4, 5, 7, 8, 9]])
    df[[2, 4, 5, 7, 8, 9]] = abs_scaler.transform(df[[2, 4, 5, 7, 8, 9]])
    clustering = KMeans(n_clusters=number_of_clusters, verbose=1).fit(df)
    result = clustering.labels_
    logging.info("result: {0}".format(result))
    clustering_result = {}
    for i, label in enumerate(result):
        clustering_result.setdefault(label, []).append(users_features_vectors[i])
    with open('users_vectors_clustering.p', 'wb') as file_to_write:
        pickle.dump(clustering_result, file_to_write)
def run_isolation_forest(file_path):
    """
    Run test for Isolation forest model
    :param file_path: the path of the data set
    :return: anomaly-detector prediction on the attack test set
    """
    features_list = ['Direction', 'Speed']
    df_train = pd.read_csv(f'{file_path}/without_anom.csv')
    df_train = df_train[features_list]
    scaler = MaxAbsScaler()
    X_train = scaler.fit_transform(df_train)
    # random forest used as an autoencoder-style reconstructor (X -> X)
    random_model = MultiOutputRegressor(
        RandomForestRegressor(max_depth=2, max_features="sqrt"))
    random_model.fit(X_train, X_train)
    pred = random_model.predict(X_train)
    # isolation_model = MultiOutputRegressor(IsolationForest()).fit(X_train)
    # pred = isolation_model.predict(X_train)
    test_path = "C:\\Users\\Yehuda Pashay\\Desktop\\fligth_data\\data_set\\test\\chicago_to_guadalajara\\down_attack"
    df_test = pd.read_csv(f'{test_path}/sensors_8.csv')
    df_test = df_test[features_list]
    X_test = scaler.transform(df_test)
    test_pred = random_model.predict(X_test)
    return test_pred
def use_MaxAbsScaler():
    # uses the largest absolute value per column: maps values into [-1, 1]
    x = [[1., -1., 5.],
         [2., 0., -5.],
         [0., 1., -10.]]
    scaler = MaxAbsScaler()
    scaler.fit(x)
    print(scaler.transform(x))
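# A minimal sketch verifying the arithmetic behind the example above:
# MaxAbsScaler divides each column by its maximum absolute value, here
# (2, 1, 10), so a manual division reproduces scaler.transform(x).
import numpy as np
from sklearn.preprocessing import MaxAbsScaler

x = np.array([[1., -1., 5.], [2., 0., -5.], [0., 1., -10.]])
manual = x / np.abs(x).max(axis=0)
assert np.allclose(manual, MaxAbsScaler().fit_transform(x))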
def _scaled(self):
    '''
    Feature scaling, using MaxAbsScaler.
    :return:
    '''
    print("----- Begin run scaled at %s -------" % current_time())
    train_scales = {}
    test_scales = {}
    self.scalers = {}
    for _type in self.types:
        if _type == 'type 1':
            train_last_index = 5  # last 5 columns: group_1/date_act/date_people/char_38/outcome
            test_last_index = 4   # last 4 columns: group_1/date_act/date_people/char_38
        else:
            train_last_index = 6  # last 6 columns: group_1/char_10_act/date_act/date_people/char_38/outcome
            test_last_index = 5   # last 5 columns: group_1/char_10_act/date_act/date_people/char_38
        scaler = MaxAbsScaler()
        train_array = self.train_datas[_type].toarray()
        train_front = train_array[:, :-train_last_index]
        train_mid = scaler.fit_transform(train_array[:, -train_last_index:-1])  # outcome must not be scaled
        train_end = train_array[:, -1].reshape((-1, 1))  # outcome
        train_scales[_type] = np.hstack((train_front, train_mid, train_end))
        test_array = self.test_datas[_type].toarray()
        test_front = test_array[:, :-test_last_index]
        test_end = scaler.transform(test_array[:, -test_last_index:])
        test_scales[_type] = np.hstack((test_front, test_end))
        self.scalers[_type] = scaler
    self.train_datas = train_scales
    self.test_datas = test_scales
    print("----- End run scaled at %s -------" % current_time())
class InputSequenceSharedData:
    def __init__(self, *, frac_train):
        print('InputSequenceSharedData: reading data..')
        with np.load('data/features.npz') as data:
            self.data_user_ids = data['user_id']
            self.num_rows = int(data['num_rows'])
            self.num_users = int(data['num_users'])
        with np.load('data/data.npz') as data:
            self.data_x, self.data_y = data['x'], data['y']
        print('InputSequenceSharedData: scaling features...')
        self.scaler = MaxAbsScaler().fit(self.data_x)
        self.data_x = self.scaler.transform(self.data_x)
        print('InputSequenceSharedData: building data structures...')
        self.num_train = int(self.num_rows * frac_train)
        self.train_user_rows = [[] for _ in range(self.num_users)]
        self.test_user_rows = [[] for _ in range(self.num_users)]
        for row in range(self.num_train):
            self.train_user_rows[self.data_user_ids[row]].append(row)
        for row in range(self.num_train, self.num_rows):
            self.test_user_rows[self.data_user_ids[row]].append(row)
        # per-user cumulative row counts (prefix sums over the user lists)
        self.train_user_offsets, self.test_user_offsets = [0], [0]
        for user_id in range(self.num_users):
            self.train_user_offsets.append(self.train_user_offsets[-1] +
                                           len(self.train_user_rows[user_id]))
            self.test_user_offsets.append(self.test_user_offsets[-1] +
                                          len(self.test_user_rows[user_id]))
        self.train_user_offsets = self.train_user_offsets[1:]
        self.test_user_offsets = self.test_user_offsets[1:]
        print('InputSequenceSharedData: init done')
class ScalingAdder(BaseEstimator, TransformerMixin):
    def _create_scaler(self, scaler):
        if scaler == 'std':
            self._sc = StandardScaler()
        if scaler == 'minmax':
            self._sc = MinMaxScaler()
        if scaler == 'maxabs':
            self._sc = MaxAbsScaler()

    def __init__(self, scaler=None):
        self.scaler = scaler
        self._create_scaler(scaler)

    def set_params(self, scaler=None, **parameters):
        self.scaler = scaler
        self._create_scaler(scaler)
        return self

    def get_params(self, **kwargs):
        return {"scaler": self.scaler}

    def transform(self, X, **transform_params):
        if self.scaler is None:
            return X
        # scale every column except the first, which is passed through
        if X.shape[1] > 1:
            return np.hstack((X[:, :1], self._sc.transform(X[:, 1:])))
        return np.hstack((X[:, :1], np.zeros(shape=(X.shape[0], 1))))

    def fit(self, X, y=None, **fit_params):
        if self.scaler is not None and X.shape[1] > 1:
            self._sc.fit(X[:, 1:], y)
        return self
def test_max_abs_scaler():
    tform = MaxAbsScaler()
    tform.fit(X)
    tform_ = convert_estimator(tform)
    X_t = tform.transform(X)
    X_t_ = tform_.transform(X)
    # assert the comparison so the test can actually fail
    assert np.allclose(X_t, X_t_)
def scale(train_datas, test_datas):
    train_results = {}
    test_results = {}
    types = ['type %d' % i for i in range(1, 8)]
    for _type in types:
        if _type == 'type 1':
            train_last_index = 5  # last 5 columns: group_1/date_act/date_people/char_38/outcome
            test_last_index = 4   # last 4 columns: group_1/date_act/date_people/char_38
        else:
            train_last_index = 6  # last 6 columns: group_1/char_10_act/date_act/date_people/char_38/outcome
            test_last_index = 5   # last 5 columns: group_1/char_10_act/date_act/date_people/char_38
        scaler = MaxAbsScaler()
        train_array = train_datas[_type].toarray()
        train_front = train_array[:, :-train_last_index]
        train_mid = scaler.fit_transform(train_array[:, -train_last_index:-1])  # outcome must not be scaled
        train_end = train_array[:, -1].reshape((-1, 1))  # outcome
        train_results[_type] = np.hstack((train_front, train_mid, train_end))
        test_array = test_datas[_type].toarray()
        test_front = test_array[:, :-test_last_index]
        test_end = scaler.transform(test_array[:, -test_last_index:])
        test_results[_type] = np.hstack((test_front, test_end))
    return train_results, test_results
def ml_stratified_cv():
    from sklearn.model_selection import StratifiedKFold
    from sklearn.preprocessing import MaxAbsScaler
    scaler = MaxAbsScaler()
    flag_scale = True
    cv = StratifiedKFold(n_splits=10, shuffle=True)
    ytrue, ypred, score = [], [], []
    for itr, its in cv.split(X, y):
        Xtr, ytr = X[itr], y[itr]
        Xts, yts = X[its], y[its]
        if flag_scale:
            scaler.fit(Xtr)
            Xtr = scaler.transform(Xtr)
            Xts = scaler.transform(Xts)
        clf.fit(Xtr, ytr)
        ypr = clf.predict(Xts)
        sco = clf.decision_function(Xts)
        ytrue.append(yts)
        ypred.append(ypr)
        score.append(sco)
    ytrue = np.concatenate(ytrue)
    ypred = np.concatenate(ypred)
    score = np.concatenate(score)
    print(tw.clf_results_extended(ytrue, score))
def main(arguments):
    display_params(arguments)
    # load datasets as lists of document objects
    datasets = [read_datsets(x, arguments['multi']) for x in arguments['input']]
    features_list = [x for x in ['tfidf', 'char_grams', 'lexical', 'style',
                                 'readability', 'nela'] if arguments[x]]
    maxabs_scaler = MaxAbsScaler()
    features_instance = features(datasets[0])
    for i in range(len(datasets)):
        X = compute_features(datasets[i], features_instance,
                             tfidf=arguments['tfidf'],
                             char_grams=arguments['char_grams'],
                             lexical=arguments['lexical'],
                             style=arguments['style'],
                             readability=arguments['readability'],
                             nela=arguments['nela'])
        if i == 0:
            # the first dataset is assumed to be the training set
            X = maxabs_scaler.fit_transform(X)
        else:
            X = maxabs_scaler.transform(X)
        dump_feature_file(X, get_output_file_name(arguments['input'][i], features_list))
class MaxAbsScalerPrim(primitive):
    def __init__(self, random_state=0):
        super(MaxAbsScalerPrim, self).__init__(name='MaxAbsScaler')
        self.id = 8
        self.hyperparams = []
        self.type = 'feature preprocess'
        self.description = ("Scale each feature by its maximum absolute value. "
                            "This estimator scales each feature individually such that the maximal "
                            "absolute value of each feature in the training set will be 1.0. "
                            "It does not shift/center the data, and thus does not destroy any sparsity. "
                            "This scaler can also be applied to sparse CSR or CSC matrices.")
        self.hyperparams_run = {'default': True}
        self.scaler = MaxAbsScaler()
        self.accept_type = 'c_t'

    def can_accept(self, data):
        return self.can_accept_c(data)

    def is_needed(self, data):
        return True

    def fit(self, data):
        data = handle_data(data)
        self.scaler.fit(data['X'])

    def produce(self, data):
        output = handle_data(data)
        cols = list(output['X'].columns)
        # tag scaled columns, leaving one-hot column names untouched
        for i in range(len(cols)):
            if 'one_hot' not in cols[i]:
                cols[i] = "{}_mxabsscale".format(cols[i])
        output['X'] = pd.DataFrame(self.scaler.transform(output['X']), columns=cols)
        return {0: output}
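# A minimal sketch (toy data, not from the original pipeline) backing up the
# description above: MaxAbsScaler accepts a sparse CSR matrix and returns
# one, since it never shifts or centers the data.
import numpy as np
from scipy import sparse
from sklearn.preprocessing import MaxAbsScaler

X_sparse = sparse.csr_matrix(np.array([[0., 2.], [0., -4.], [1., 0.]]))
X_scaled = MaxAbsScaler().fit_transform(X_sparse)
assert sparse.issparse(X_scaled)
assert X_scaled.nnz == X_sparse.nnz  # zero entries stay zero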
def normalize_data(X_train, y_train, X_test, name, method='diff'):
    """Normalize dataset.

    Please note that it doesn't modify the original dataset, it just returns
    a new dataset that you can use to modify the original dataset or create
    a new one.
    """
    # the scaler choice is driven by CONFIG, not the `method` argument
    if CONFIG.NORMALIZATION['method'] == 'max':
        scaler = MaxAbsScaler()
        scaler_y = MaxAbsScaler()
    elif CONFIG.NORMALIZATION['method'] == 'diff':
        scaler = MinMaxScaler()
        scaler_y = MinMaxScaler()
    elif CONFIG.NORMALIZATION['method'] == 'std':
        scaler = StandardScaler()
        scaler_y = StandardScaler()
    else:
        raise ValueError(
            'Internal Error: Value of CONFIG.NORMALIZATION["method"] should be "max", "diff", "std".')
    X_train = pd.DataFrame(data=scaler.fit_transform(X_train),
                           index=X_train.index, columns=X_train.columns)
    X_test = pd.DataFrame(data=scaler.transform(X_test),
                          index=X_test.index, columns=X_test.columns)
    y_train = scaler_y.fit_transform(y_train.values.reshape(-1, 1))
    if name == 'LIGHTGBM':
        y_train = [i[0] for i in y_train]  # TODO: to do in a more efficient way
    return X_train, y_train, X_test, scaler_y
def scaler_dummy(dataset):
    scaler_mm = MinMaxScaler()
    scaler_ma = MaxAbsScaler()
    scaler_sd = StandardScaler()
    scaler_rb = RobustScaler()
    numerical = list(dataset.columns)
    data_transform_mm = pd.DataFrame(data=dataset)
    data_transform_ma = pd.DataFrame(data=dataset)
    data_transform_sd = pd.DataFrame(data=dataset)
    data_transform_rb = pd.DataFrame(data=dataset)
    data_transform_mm[numerical] = scaler_mm.fit_transform(dataset[numerical])
    data_transform_ma[numerical] = scaler_ma.fit_transform(dataset[numerical])
    data_transform_sd[numerical] = scaler_sd.fit_transform(dataset[numerical])
    data_transform_rb[numerical] = scaler_rb.fit_transform(dataset[numerical])
    ## get dummies
    features_final_mm = pd.get_dummies(data_transform_mm)
    features_final_ma = pd.get_dummies(data_transform_ma)
    features_final_sd = pd.get_dummies(data_transform_sd)
    features_final_rb = pd.get_dummies(data_transform_rb)
    return features_final_mm, features_final_ma, features_final_sd, features_final_rb
def normalize_raw_features(X: csr_matrix) -> csr_matrix:
    """Normalize each feature column that was not one-hot encoded."""
    for col in range(X.shape[1]):
        dense_col = X[:, col].todense()
        # anything outside [0, 1] cannot be a one-hot column, so scale it
        if (dense_col > 1.).any() or (dense_col < 0.).any():
            scaler = MaxAbsScaler().fit(dense_col)
            X[:, col] = csr_matrix(scaler.transform(dense_col))
    return X
def scale_data(x_train, x_test):
    """
    We only scale the continuous features. No need to scale binary features.
    """
    idx_binary = []  # columns with boolean values
    for k in range(x_train.shape[1]):
        # a column is binary if casting it to bool leaves it unchanged
        idx_binary.append(np.array_equal(x_train[:, k], x_train[:, k].astype(bool)))
    idx_cont = np.logical_not(idx_binary)
    sc = MaxAbsScaler()
    sc.fit(x_train[:, idx_cont])
    x_train[:, idx_cont] = sc.transform(x_train[:, idx_cont])
    x_test[:, idx_cont] = sc.transform(x_test[:, idx_cont])
    return x_train, x_test
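# A small check (toy arrays, not from the original data) of the
# binary-column test used above: the 0/1 column survives the bool
# round-trip and is left alone, while the continuous column is divided
# by its maximum absolute value (10 here).
import numpy as np

x_tr = np.array([[0., 5.], [1., -10.], [1., 2.]])
x_te = np.array([[1., 4.]])
scale_data(x_tr, x_te)
assert np.allclose(x_tr[:, 0], [0., 1., 1.])      # binary column untouched
assert np.allclose(x_tr[:, 1], [0.5, -1., 0.2])   # scaled by max abs = 10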
def test_maxabsscaler_vs_sklearn():
    # Compare msmbuilder.preprocessing.MaxAbsScaler
    # with sklearn.preprocessing.MaxAbsScaler
    maxabsscalerr = MaxAbsScalerR()
    maxabsscalerr.fit(np.concatenate(trajs))
    maxabsscaler = MaxAbsScaler()
    maxabsscaler.fit(trajs)
    y_ref1 = maxabsscalerr.transform(trajs[0])
    y1 = maxabsscaler.transform(trajs)[0]
    np.testing.assert_array_almost_equal(y_ref1, y1)
def plotPCA(X_train, y_train, X_test, y_test, outdir):
    scaler = MaxAbsScaler(copy=False)
    target_names = ("Positive", "Negative")
    term = outdir.parent.name.replace("_", " ")
    pca = PCA(n_components=2)
    pca.fit(X_train)
    scaler.fit(pca.transform(X_train))
    for X, y, n in ((X_train, y_train, 'training'), (X_test, y_test, 'testing')):
        X_r = scaler.transform(pca.transform(X))
        # keep only points that fall inside the [-1, 1] x [-1, 1] square
        inlier = (numpy.abs(X_r[:, 0]) <= 1) & (numpy.abs(X_r[:, 1]) <= 1)
        plt.clf()
        for c, i, target_name in zip("rg", (0, 1), target_names):
            plt.scatter(X_r[(y == i) & inlier, 0],
                        X_r[(y == i) & inlier, 1],
                        c=c, label=target_name, marker=",", s=1,
                        linewidth=0, alpha=0.7)
        plt.legend()
        plt.title('PCA for %s on %s data' % (term, n))
        plt.savefig(str(outdir / ('pca-%s.png' % (n,))))
        plt.savefig(str(outdir / ('pca-%s.ps' % (n,))))
def _train_test_split():
    # Build the store_weather dataframe
    store_weather_filename = Config.save_dir + "store_weather.pkl"
    if os.path.exists(store_weather_filename):
        store_weather = utils.from_pickle(store_weather_filename)
    else:
        store_weather = _preprocess_data()
    # Split train test for each store
    train = pd.DataFrame({})
    test = pd.DataFrame({})
    store_ids = store_weather.store_id_bk.unique()
    for sid in store_ids:
        c_store = store_weather[store_weather.store_id_bk == sid]
        s_train = c_store[:-Config.test_size]
        s_test = c_store[-Config.test_size:]
        train = train.append(s_train).reset_index().drop(["index"], axis=1)
        test = test.append(s_test).reset_index().drop(["index"], axis=1)
    # Scale numeric columns
    num_cols = ["p_total_revenue", "p_total_volume", "mean_temp",
                "total_precipitation", "total_snow"]
    scaler = MaxAbsScaler().fit(train.loc[:, num_cols])
    train.loc[:, num_cols] = scaler.transform(train.loc[:, num_cols])
    test.loc[:, num_cols] = scaler.transform(test.loc[:, num_cols])
    # Scale the 2 output columns
    revenue_scale = MaxAbsScaler().fit(train.loc[:, ["total_revenue"]])
    volume_scale = MaxAbsScaler().fit(train.loc[:, ["total_volume"]])
    train.loc[:, ["total_revenue"]] = revenue_scale.transform(train.loc[:, ["total_revenue"]])
    test.loc[:, ["total_revenue"]] = revenue_scale.transform(test.loc[:, ["total_revenue"]])
    train.loc[:, ["total_volume"]] = volume_scale.transform(train.loc[:, ["total_volume"]])
    test.loc[:, ["total_volume"]] = volume_scale.transform(test.loc[:, ["total_volume"]])
    # Save the train/test dataframes to pickle objects
    utils.to_pickle(Config.save_dir + "train_set.pkl", train)
    utils.to_pickle(Config.save_dir + "test_set.pkl", test)
    # Save the 2 scalers for later use
    utils.to_pickle(Config.save_dir + "revenue_scale", revenue_scale)
    utils.to_pickle(Config.save_dir + "volume_scale", volume_scale)
    # Save store_ids
    utils.to_pickle(Config.save_dir + "store_id.pkl", store_ids)
    return train, test
      1066, 1053, 1339, 1040, 497, 253, 1485, 337, 1347, 1343, 122, 980,
      87, 126, 528, 694, 1444, 655, 161, 626, 545, 906, 1235, 684, 263,
      69, 882, 1209, 180, 1386, 1074, 631, 908, 1176, 947, 401, 1085,
      1029, 797, 1107, 386, 559, 588, 522, 644, 614, 1440, 1140, 1267,
      1475, 217, 1201, 456, 231, 1079, 1224, 1036, 156, 852, 1384, 1288,
      243, 760, 1071]
# 6. Convert to numpy.array
train_index = np.asarray(A)
test_index = np.asarray(B)
# 7. Split the data
X_train, X_test = raw_X[train_index], raw_X[test_index]
y_train, y_test = raw_y[train_index], raw_y[test_index]
# 8. Normalization
X_train_norm = normalizer.fit_transform(X_train)
X_test_norm = normalizer.transform(X_test)
# 9. Train the algorithm
clf = SVC(kernel='rbf', C=1, gamma=0.5, coef0=0.0)
clf.fit(X_train_norm, y_train)
pred = clf.predict(X_test_norm)
acc = accuracy_score(pred, y_test)
# 10. Result
print("Accuracy:", acc)
# 11. Save the model
with open("model.pickle", "wb") as f:
    pickle.dump((clf, normalizer), f, 2)
def normalize_features(X: np.ndarray) -> np.ndarray:
    """Normalize features by scaling each one into [-1, 1] by its maximum absolute value."""
    scaler = MaxAbsScaler().fit(X)
    return scaler.transform(X)
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MaxAbsScaler
from sklearn.neighbors import KNeighborsClassifier

# NOTE: Make sure that the class is labeled 'class' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR')
training_indices, testing_indices = train_test_split(tpot_data.index,
                                                     stratify=tpot_data['class'].values,
                                                     train_size=0.75, test_size=0.25)
result1 = tpot_data.copy()

# Use Scikit-learn's MaxAbsScaler to scale the features
training_features = result1.loc[training_indices].drop('class', axis=1)
if len(training_features.columns.values) > 0:
    scaler = MaxAbsScaler()
    scaler.fit(training_features.values.astype(np.float64))
    scaled_features = scaler.transform(result1.drop('class', axis=1).values.astype(np.float64))
    result1 = pd.DataFrame(data=scaled_features)
    # the rebuilt frame has no 'class' column, so copy it from tpot_data
    result1['class'] = tpot_data['class'].values
else:
    result1 = result1.copy()

# Perform classification with a k-nearest neighbor classifier
knnc2 = KNeighborsClassifier(n_neighbors=min(10, len(training_indices)))
knnc2.fit(result1.loc[training_indices].drop('class', axis=1).values,
          result1.loc[training_indices, 'class'].values)
result2 = result1.copy()
result2['knnc2-classification'] = knnc2.predict(result2.drop('class', axis=1).values)
def _max_abs_scaler(column):
    sc = MaxAbsScaler()
    # scalers expect 2-D input, hence the reshape
    new_col = sc.fit_transform(column.reshape(-1, 1))
    return new_col
model.add(Activation('relu'))
model.add(GaussianNoise(0.00001))
model.add(Dropout(0.3))
model.add(MaxoutDense(1, input_dim=100))
model.add(Activation('sigmoid'))
# ada = Adagrad(lr=0.001)
ada = SGD(lr=0.0003, momentum=0.9, decay=0.0001, nesterov=True)
model.compile(optimizer=ada, loss='binary_crossentropy', metrics=['accuracy'])

scaler = MaxAbsScaler()
train_train_scaled = scaler.fit_transform(train_train[features])
train_test_scaled = scaler.transform(train_test[features])
model.fit(train_train_scaled, train_train.target.values, nb_epoch=150, batch_size=100)
train_train_pred = model.predict(train_train_scaled, batch_size=100)
train_test_pred = model.predict(train_test_scaled, batch_size=100)
train_score = log_loss(train_train.target.values, train_train_pred)
test_score = log_loss(train_test.target.values, train_test_pred)

test_scaled = scaler.transform(test[features])
test_pred = model.predict(test_scaled, batch_size=100)
ensemble_train.loc[train_test.index, 'nn'] = train_test_pred
submission.loc[:, 'm_{}'.format(ind)] = test_pred
def main():
    X, y = get_data('../../data/train.csv')
    sclr = MaxAbsScaler()
    X = sclr.fit_transform(X)
    X_test, y_test = get_data('../../data/val.csv')
    X_test = sclr.transform(X_test)
    X_fin, y_fin = get_data('../../data/test.csv')
    X_fin = sclr.transform(X_fin)
    other, yo = get_data('../../data/other.csv')
    other = sclr.transform(other)

    lin = linear_model.LogisticRegression(C=10000)
    lin.fit(X, y)
    x1 = lin.predict_proba(X)
    x1_test = lin.predict_proba(X_test)
    print('lin')
    print(metrics.classification_report(y, lin.predict(X)))
    print(metrics.classification_report(y_test, lin.predict(X_test)))
    print(metrics.classification_report(y_fin, lin.predict(X_fin)))

    # one-vs-rest ROC curves on the held-out set
    roc = lin.predict_proba(X_fin)
    fpr_grd0, tpr_grd0, _ = metrics.roc_curve(y_fin, roc[:, 0], pos_label=0)
    fpr_grd1, tpr_grd1, _ = metrics.roc_curve(y_fin, roc[:, 1], pos_label=1)
    fpr_grd2, tpr_grd2, _ = metrics.roc_curve(y_fin, roc[:, 2], pos_label=2)
    plt.plot(fpr_grd0, tpr_grd0, label='NRP')
    plt.plot(fpr_grd1, tpr_grd1, label='RiPP')
    plt.plot(fpr_grd2, tpr_grd2, label='Polyketide')
    plt.xlabel('False positive rate')
    plt.ylabel('True positive rate')
    plt.title('ROC curve')
    plt.legend(loc='best')
    plt.show()

    svm_model = SVC(C=5000, coef0=100, probability=True)
    svm_model.fit(X, y)
    x2 = svm_model.predict_proba(X)
    x2_test = svm_model.predict_proba(X_test)
    x2_fin = svm_model.predict_proba(X_fin)
    o2 = svm_model.predict_proba(other)
    print('svm')
    print(metrics.classification_report(y, svm_model.predict(X)))
    print(metrics.classification_report(y_test, svm_model.predict(X_test)))