def cv_mean_std_array(X, y, alphas, ks, n_a, n_k, cv=20):
    """Cross-validate KNN on Lasso-reduced features over an (alpha, k) grid.

    ``alphas`` and ``ks`` are flat sequences indexed as ``n_k * i + j`` where
    ``i`` runs over the ``n_a`` alpha values and ``j`` over the ``n_k`` k
    values.  For every alpha the Lasso reduction is computed once via
    ``df_Lasso`` and reused for all of its k values.

    Parameters
    ----------
    X, y : training data and raw targets; ``y`` is binarized at 1400 before
        being handed to the KNN cross-validation.
    alphas, ks : flat grids of length ``n_a * n_k``.
    n_a, n_k : number of distinct alpha / k values.
    cv : number of folds forwarded to ``knn_cv_mean_and_std``.

    Returns
    -------
    (cv_mean, cv_std, regressors) : two arrays of length ``n_a * n_k`` and a
        DataFrame with one column per alpha, as returned by ``df_Lasso``.
    """
    # The original read the undefined names n_alphas / n_ks here and in the
    # index arithmetic below; the grid size is given by the n_a / n_k
    # parameters.
    n = n_a * n_k
    cv_mean = np.empty(n)
    cv_std = np.empty(n)
    regressors = pd.DataFrame()

    binarizer = Binarizer(threshold=1400)
    y_binary = binarizer.transform(y).transpose().ravel()

    itt_counter = 0
    print('size n_a: %d n_k: %d' % (n_a, n_k))
    for i in range(n_a):
        row_start = i * n_k  # first flat index belonging to this alpha
        print('reg. column : %d' % row_start)
        temp_string = 'alpha=%f' % alphas[row_start]
        print(temp_string)
        print(regressors.shape)

        df_temp = pd.DataFrame()
        print('computing for alpha = %f' % (alphas[row_start]))
        X_lasso, df_temp[temp_string] = df_Lasso(X, y, alphas[row_start])
        regressors = pd.concat([regressors, df_temp],
                               ignore_index=True, axis=1)

        for j in range(n_k):
            pos = row_start + j
            print('i:%d, j:%d' % (i, j))
            print('computing for alpha = %f and k = %f'
                  % (alphas[pos], ks[pos]))
            print('X_lasso shape:')
            print(X_lasso.shape)
            cv_mean[pos], cv_std[pos] = knn_cv_mean_and_std(
                X_lasso, y_binary, alphas[pos], ks[pos], cv=cv)
            itt_counter = itt_counter + 1
            print('completed %dth iteration of knn cv mean:%f std:%f, at pos:%d'
                  % (itt_counter, cv_mean[pos], cv_std[pos], pos))
    return cv_mean, cv_std, regressors
def initialize():
    """Return a fitted 3-NN Jaccard classifier on the binarized data.

    The samples and labels come from ``load_mnist_data()``; the samples are
    binarized with a default ``Binarizer`` before the classifier is fitted.
    """
    samples, targets = load_mnist_data()
    binary_samples = Binarizer().fit(samples).transform(samples)

    classifier = KNeighborsClassifier(n_neighbors=3, metric='jaccard')
    classifier.fit(binary_samples, targets)
    return classifier
def cv_mean_std_array(X, y, alphas, n_a, cv=20):
    """Collect OLS / Lasso / Ridge CV means and stds for ``n_a`` alphas.

    Parameters
    ----------
    X, y : training data and raw targets; ``y`` is binarized at 1400.
    alphas : sequence with at least ``n_a`` regularisation strengths.
    n_a : number of alphas to evaluate.
    cv : accepted for interface compatibility; not forwarded, as in the
        original code.

    Returns
    -------
    Six arrays of length ``n_a``: OLS mean/std, Lasso mean/std,
    Ridge mean/std (in that order).
    """
    binarizer = Binarizer(threshold=1400)
    y_binary = binarizer.transform(y).transpose().ravel()

    (cv_ols_means, cv_ols_stds,
     cv_lasso_means, cv_lasso_stds,
     cv_ridge_means, cv_ridge_stds) = [np.empty(n_a) for _ in range(6)]

    for i in range(n_a):
        print('computing for alpha=%f' % alphas[i])
        # The original call read `lm_cv_mean_and_std(X, , alphas[i])`, a
        # syntax error.  y_binary was computed above and otherwise unused, so
        # it is taken to be the missing argument.
        # NOTE(review): confirm the helper expects the binarized target.
        (cv_ols_means[i], cv_ols_stds[i],
         cv_lasso_means[i], cv_lasso_stds[i],
         cv_ridge_means[i], cv_ridge_stds[i]) = lm_cv_mean_and_std(
            X, y_binary, alphas[i])
        print('successfully computed iteration %d' % i)

    return (cv_ols_means, cv_ols_stds, cv_lasso_means, cv_lasso_stds,
            cv_ridge_means, cv_ridge_stds)
def binarizeMatrix(dataMatrix, threshold):
    """Map every entry of ``dataMatrix`` to 0 or 1.

    Entries are compared against ``threshold`` by a ``Binarizer``.  With a
    threshold chosen inside [0, 1], values below 0 become 0, values above 1
    become 1, and values in between depend on the threshold.
    """
    return Binarizer(threshold=threshold).fit_transform(dataMatrix)
def test_binarizer():
    """Check Binarizer output counts, output type and copy semantics."""

    def check_counts(result, n_zeros, n_ones):
        # Number of 0 and 1 entries in the dense result.
        assert_equal(np.sum(result == 0), n_zeros)
        assert_equal(np.sum(result == 1), n_ones)

    base = np.array([[1, 0, 5], [2, 3, 0]])

    for make in (np.array, sp.csr_matrix, sp.csc_matrix):
        X = make(base.copy())

        # Explicit threshold: only the values above 2 survive.
        thresholded = Binarizer(threshold=2.0, copy=True)
        check_counts(toarray(thresholded.transform(X)), 4, 2)
        raw_out = thresholded.transform(X)
        assert_equal(type(X), type(raw_out))

        # Default threshold after an explicit fit().
        fitted = Binarizer(copy=True).fit(X)
        dense_out = toarray(fitted.transform(X))
        assert_true(dense_out is not X)
        check_counts(dense_out, 2, 4)

        # Default threshold without fit(); copy=True must not alias X.
        copying = Binarizer(copy=True)
        out = copying.transform(X)
        assert_true(out is not X)
        check_counts(toarray(out), 2, 4)

        # copy=False must operate in place and hand back X itself.
        # This stays last because it mutates X.
        in_place = Binarizer(copy=False)
        out = in_place.transform(X)
        assert_true(out is X)
        check_counts(toarray(out), 2, 4)
def test_binarizer_vs_sklearn():
    """Compare msmbuilder.preprocessing.Binarizer with sklearn's Binarizer.

    The reference binarizer is fitted on the concatenated trajectories, the
    other one on the list of trajectories; both must agree on the first
    trajectory.
    """
    reference = BinarizerR()
    reference.fit(np.concatenate(trajs))
    expected = reference.transform(trajs[0])

    candidate = Binarizer()
    candidate.fit(trajs)
    actual = candidate.transform(trajs)[0]

    np.testing.assert_array_almost_equal(expected, actual)
def wine_quality_white():
    """Convert the raw white-wine-quality CSV into a binary-labelled .npz.

    The first 11 columns are stored as ``data``.  The 12th column is
    binarized at a threshold of 4 and then inverted, so the saved ``label``
    is 1 where the raw value is <= 4 and 0 otherwise.
    """
    # white wine quality dataset
    filename = '../../data/raw/mldata/winequality-white.csv'

    # The data corresponds to the 11 first columns of the csv file
    data = np.loadtxt(filename, usecols=tuple(range(11)), delimiter=';',
                      dtype=float)

    # Read the label
    quality = np.loadtxt(filename, usecols=(11,), delimiter=';', dtype=int)

    # We need to binarise the label using a threshold at 4.
    # Binarizer only accepts 2-D input, so the 1-D label vector is presented
    # as a single column; the ravel below flattens it again.
    bn = Binarizer(threshold=4)
    label = bn.fit_transform(quality.reshape(-1, 1))

    # We need to inverse the label -> 1=0 and 0=1
    label = np.ravel(np.abs(label - 1))

    np.savez('../../data/clean/uci-wine-quality-white.npz',
             data=data, label=label)
def fit(self, X, y=None):
    """
    Fit the binarizer to the data.

    For unsupervised methods the per-feature thresholds and scores are used
    directly.  For supervised methods ``y`` is one-hot encoded, thresholds
    are selected per class, and for each feature the threshold of the class
    with the highest normalized score is kept.

    Raises ValueError for an unknown method, for a missing ``y`` with a
    supervised method, or when X and y have different sample counts.
    """
    methods = Binarizer._UNSUPERVISED_METHODS + Binarizer._SUPERVISED_METHODS
    if self.method not in methods:
        raise ValueError("Method should be one of {0}".format(", ".join(methods)))
    X = check_array(X, accept_sparse=['csr', 'csc'])
    if issparse(X):
        X = X.tocsc()
    if self.method in Binarizer._UNSUPERVISED_METHODS:
        self._fit_unsupervised(X)
        self.joint_thresholds_ = self.thresholds_
        self.joint_scores_ = self.scores_
    else:
        if y is None:
            raise ValueError("y must not be None for supervised binarizers.")
        # TODO: move the label encoding into a separate function
        label_binarizer = SK_LabelBinarizer()
        Y_new = label_binarizer.fit_transform(y)
        self.classes_ = label_binarizer.classes_
        if X.shape[0] != Y_new.shape[0]:
            raise ValueError("X and y have incompatible shapes.\n"
                             "X has %s samples, but y has %s."
                             % (X.shape[0], Y_new.shape[0]))
        self._fit_supervised(X, Y_new)
        if len(self.classes_) <= 2:
            self.joint_thresholds_ = self.thresholds_[:, 0]
            self.joint_scores_ = self.scores_[:, 0]
        else:
            min_class_scores = np.min(self.scores_, axis=0)
            max_class_scores = np.max(self.scores_, axis=0)
            diffs = max_class_scores - min_class_scores
            # avoid division by zero for classes with constant scores
            diffs[np.where(diffs == 0)] = 1
            normalized_scores = (self.scores_ - min_class_scores) / diffs
            # for each feature find the class for which it is most useful
            # (this could probably be done differently)
            optimal_indexes = np.argmax(normalized_scores, axis=1)
            nfeat = self.thresholds_.shape[0]
            # as the binarization threshold of each feature take the value
            # for the class where the feature is most useful
            self.joint_thresholds_ = self.thresholds_[np.arange(nfeat), optimal_indexes]
            self.joint_scores_ = self.scores_[np.arange(nfeat), optimal_indexes]
    # Pass the thresholds to the sklearn binarizer.  `threshold` is given by
    # keyword because recent scikit-learn releases make it keyword-only.
    self.binarize_transformer_ = SK_Binarizer(threshold=self.joint_thresholds_)
    return self
def do_logreg():
    """Grid-search and cross-validate a logistic regression on auto-mpg.

    ``mpg`` is binarized at its mean into above/below-average classes, the
    remaining columns are standardised, C is tuned on the training half and
    the best estimator is cross-validated on the test half.

    NOTE(review): ``nfolds`` and ``np`` are not defined in this function and
    must exist at module level — confirm.
    """
    from sklearn.preprocessing import Binarizer, scale
    from sklearn.linear_model import LogisticRegression
    from sklearn.cross_validation import train_test_split
    from sklearn.cross_validation import cross_val_score
    from sklearn.grid_search import GridSearchCV
    import pandas

    ### load data
    col_names = ['mpg', 'cylinders', 'displacement', 'horsepower', 'weight',
                 'acceleration', 'model_year', 'origin', 'car_name']
    df = pandas.read_csv('auto_mpg.csv')
    df.columns = col_names
    df = df.drop('car_name', axis=1)

    lr = LogisticRegression()
    bn = Binarizer(threshold=df['mpg'].mean())
    print("Performing binarization of the mpg variable into above/below average classes")
    # Binarizer needs 2-D input, and the classifier needs a 1-D target with
    # one entry per row: feed mpg as a column and flatten the result.
    target = bn.fit_transform(df[['mpg']].values).ravel()
    data = df.drop('mpg', axis=1)
    data = scale(data)

    print("Splitting into training and test sets")
    data_train, data_test, target_train, target_test = train_test_split(
        data, target, test_size=0.5, random_state=0)

    grid = [0.001, 0.01, 0.1, 1, 10, 100, 1000]
    print('Searching for optimal C in {} using {}-fold validation on test set '.format(grid, nfolds))
    tuned_parameters = [{'C': grid}]
    clf = GridSearchCV(lr, tuned_parameters, cv=nfolds, scoring='accuracy')
    clf.fit(data_train, target_train)
    for params, mean_score, _ in clf.grid_scores_:
        print("{}: Mean accuracy {}".format(params, mean_score))

    print("""Cross-validating above/below average mpg prediction using {}-fold validation on the test dataset. Using the best estimator: {} """.format(nfolds, clf.best_estimator_))
    mean_cross = np.mean(cross_val_score(clf.best_estimator_, data_test,
                                         target_test, cv=nfolds))
    print("Mean cross-validated accuracy after optimization is: {}".format(mean_cross))
def us_crime():
    """Convert the raw communities-and-crime file into a binary .npz.

    Columns from index 5 onward are kept, missing values are imputed, the
    last remaining column is binarized at 0.65 to form ``label`` and all
    preceding columns are saved as ``data``.
    """
    # US crime dataset
    filename = '../../data/raw/mldata/communities.data'

    # The missing data will be considered as NaN
    # Only use 122 continuous features
    tmp_data = np.genfromtxt(filename, delimiter=',')
    tmp_data = tmp_data[:, 5:]

    # replace missing value by the mean
    imp = Imputer(verbose=1)
    tmp_data = imp.fit_transform(tmp_data)

    # extract the data to be saved
    data = tmp_data[:, :-1]

    # Binarizer only accepts 2-D input: present the target as one column and
    # flatten the result back to 1-D.
    bn = Binarizer(threshold=0.65)
    label = np.ravel(bn.fit_transform(tmp_data[:, -1].reshape(-1, 1)))

    np.savez('../../data/clean/uci-us-crime.npz', data=data, label=label)
return loss_value, tape.gradient(loss_value, model.trainable_variables) if __name__ == '__main__': here = os.path.dirname(os.path.abspath(__file__)) #Import mnist dataset mnist = tf.keras.datasets.mnist #Split in test and train (x_train, y_train), (x_test, y_test) = mnist.load_data() #Scale entries between(0,1) x_train = x_train/255 x_test = x_test/255 #Binarize pictures binarizer = Binarizer(threshold=0.5) x_train_binary = np.array([binarizer.fit_transform(slice) for slice in x_train]) x_test_binary = np.array([binarizer.fit_transform(slice) for slice in x_test]) #reshape pictures to be vectors and fix datatype x_train_binary = x_train_binary.reshape(x_train_binary.shape[0],-1).astype(np.float32) x_test_binary = x_test_binary.reshape(x_test_binary.shape[0],-1).astype(np.float32) one_hot_labels = np.zeros((60000,10)) one_hot_labels[np.arange(60000), y_train] = 1 dataset_train = tf.data.Dataset.from_tensor_slices(x_train_binary) dataset_train_labels = tf.data.Dataset.from_tensor_slices(y_train) dcombined = tf.data.Dataset.zip((dataset_train, dataset_train_labels)).batch(32)
# -*- encoding: utf-8 -*- ''' Created on 2016年5月22日 @author: LuoPei ''' #整体并行 from numpy import log1p # from numpy import log from sklearn.preprocessing import FunctionTransformer from sklearn.preprocessing import Binarizer from sklearn.pipeline import FeatureUnion #新建将整体特征矩阵进行对数函数转换的对象 # step2_1=('ToLog',FunctionTransformer(loglp)) step2_1 = ('ToLog', FunctionTransformer(log1p)) #新建将整体特征矩阵进行二值化类的对象 step2_2 = ('ToBinary', Binarizer()) #新建整体并行处理对象 #该对象也有fit和transform 方法,fit和transform 方法均是并行地调用需要并行处理的对象的fit和transform 方法 #参数transformer_list为需要并行处理的对象列表,该列表为二元组列表,第一元为对象的名称,第二元为对象 step2 = ('FeatureUnion', FeatureUnion(transformer_list=[step2_1, step2_2])) if __name__ == "__main__": pass
if __name__ == '__main__': #Import mnist dataset mnist = tf.keras.datasets.mnist #Split in test and train (x_train, y_train), (x_test, y_test) = mnist.load_data() #Scale entries between(0,1) x_train = x_train / 255 x_test = x_test / 255 #Binarize pictures binarizer = Binarizer(threshold=0.5) x_train_binary = np.array( [binarizer.fit_transform(slice) for slice in x_train]) x_test_binary = np.array( [binarizer.fit_transform(slice) for slice in x_test]) #reshape pictures to be vectors and fix datatype x_train_binary = x_train_binary.reshape(x_train_binary.shape[0], -1).astype(np.float32) x_test_binary = x_test_binary.reshape(x_test_binary.shape[0], -1).astype(np.float32) ''' here = os.path.dirname(os.path.abspath(__file__)) datah5 = dd.io.load(here + '/data/ising/ising_data_L32.hdf5')
#---------------------------------------------------------------------------------------
#
# Comment section below out if you already have made pickle files
#
#---------------------------------------------------------------------------------------
all_bigr = ngram(X_train, 'bigram')  # starting with all features

print("Starting counting bigrams...")
X_train_bi_counted = count(X_train, all_bigr, 'bigram')
print("Done counting train set")
X_test_bi_counted = count(X_test, all_bigr, 'bigram')
print("Done counting test set")

print("Binarizing and dumping files")
# NOTE: `bin` shadows the builtin; the name is kept because code outside
# this section may refer to it.
bin = Binarizer()
X_train_bi_binary = bin.fit_transform(X_train_bi_counted)
X_test_bi_binary = bin.transform(X_test_bi_counted)
# Context managers make sure each pickle file is flushed and closed; the
# original passed bare open(...) handles that were never closed.
with open("X_train_bi_binary.p", "wb") as fh:
    pickle.dump(X_train_bi_binary, fh)
with open("X_test_bi_binary.p", "wb") as fh:
    pickle.dump(X_test_bi_binary, fh)
print("Done")

print("Starting tfidf vectors...")
X_train_bi_tfidf, X_test_bi_tfidf = tfidf(X_train_bi_counted, X_test_bi_counted)
with open("X_train_bi_tfidf.p", "wb") as fh:
    pickle.dump(X_train_bi_tfidf, fh)
with open("X_test_bi_tfidf.p", "wb") as fh:
    pickle.dump(X_test_bi_tfidf, fh)
print("Done")

print("Starting feature selection using CART random forests on binary files")
news_data = extracted_data.iloc[:, :-1] # Take up to the second last column news_labels = extracted_data[' shares'] # Take shares column for labels # Data Preprocessing news_data_transpose = news_data.transpose() data_into_dict = news_data_transpose.to_dict() list_data = [v for k, v in data_into_dict.iteritems()] # Encode from sklearn.feature_extraction import DictVectorizer dv = DictVectorizer() transformed_data = dv.fit_transform(list_data).toarray() # Label Encoder - Binarization from sklearn.preprocessing import Binarizer binarizer = Binarizer(threshold=1400) # Threshold at 1400 because median of shares is 1400 transformed_labels = binarizer.transform(news_labels) transformed_labels = transformed_labels.transpose().ravel() # .ravel() is to fix "Too many array indices error" # Could be a scikit or pandas bug ############## Classification ################# from sklearn.tree import DecisionTreeClassifier from sklearn.neighbors import KNeighborsClassifier from sklearn.naive_bayes import GaussianNB from sklearn.linear_model import LinearRegression from sklearn.svm import SVC # Decision Tree Classifier tree = DecisionTreeClassifier() knn = KNeighborsClassifier() gnb = GaussianNB()
sd_scaler = StandardScaler()
X_sd_sc = sd_scaler.fit_transform(X)  # ndarray
print(X_sd_sc)

# (3) Normalize data: suited to sparse data
# Rescales to length 1 (unit norm?) ... "normalization" step
# Scales the data of each row so that its norm becomes 1 (?)
# Useful for neural networks with weighted inputs and for distance-based KNN
from sklearn.preprocessing import Normalizer  # norm
nm_scaler = Normalizer(norm='l2')
X_nm_sc = nm_scaler.fit_transform(X)
print(X_nm_sc)

# (4) Binarize data: produce crisp values or add attributes
# Values above the threshold are set to 1, values below it are set to 0
# Used when producing crisp values or adding attributes
from sklearn.preprocessing import Binarizer
bizer = Binarizer(threshold= 0.0)
X_b_sc = bizer.fit_transform(X)
print(X_b_sc)

#%% Chapter 9: feature selection
# Feature selection (4 methods)
# Ways to automatically select the data features used by a machine-learning
# model with sklearn
# Removing irrelevant and redundant features can improve accuracy and
# training time
# (1) Univariate feature selection
# =============================================================================
# Statistical analysis can be used to pick the features that influence the
# result the most
# sklearn provides the SelectKBest class, which can apply the chi-squared test
# The chi-squared test checks the association between a categorical
# independent variable and a categorical dependent variable
# Suppose the independent variable takes N values and the dependent
# variable takes M values,
# consider the gap between the observed and expected frequency of samples
# with x=i & y=j to build the statistic
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Binarizer

# NOTE: Make sure that the class is labeled 'target' in the data file
# 'PATH/TO/DATA/FILE' and 'COLUMN_SEPARATOR' are placeholders that must be
# replaced before the script can run.
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)
features = tpot_data.drop('target', axis=1).values
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'].values, random_state=42)

# Score on the training set was:0.8333333333333333
# Pipeline: binarize every feature at 0.1, then fit a logistic regression.
exported_pipeline = make_pipeline(
    Binarizer(threshold=0.1),
    LogisticRegression(C=20.0, dual=True, penalty="l2"))

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
# Binarization from sklearn.preprocessing import Binarizer import pandas as pd filename = 'indians-diabetes.data.csv' names = [ 'preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class' ] dataframe = pd.read_csv(filename, names=names) array = dataframe.values # separate array into input and output components X = array[:, 0:8] Y = array[:, 8] binarizer = Binarizer(threshold=5) binaryX = binarizer.fit_transform(X) # summarize transformed data print(binaryX[0:30, :])
dataSizeStatisticsTest = zeros(numCombinations, dtype=float)
varSizeStatisticsTrain = zeros(numCombinations, dtype=float)
varSizeStatisticsTest = zeros(numCombinations, dtype=float)
a = 0

mnist = fetch_mldata('MNIST original')

# split a training set and a test set
y_train, y_test = mnist.target[:60000], mnist.target[60000:70000]

#vectorizer = CountVectorizer(binary=True)
X_both = mnist.data
# The original called `Binarizer().fit(50, X_both)`, which passes 50 as the
# data and X_both as the ignored `y`, leaving the threshold at its default of
# 0.0.  The threshold belongs in the constructor.
# NOTE(review): this changes the binarization cut-off from 0 to 50, which is
# what the call appears to have intended — confirm.
binarizer = Binarizer(threshold=50).fit(X_both)
X_both = binarizer.transform(X_both)
X_train = X_both[:60000]
X_test = X_both[60000:70000]
#print X_train[1]

#ch2 = SelectKBest(chi2, 750)
#X_train = ch2.fit_transform(X_train, y_train)
#X_test = ch2.transform(X_test)

data_train = X_train
m, n = data_train.shape
# Same output as the Python 2 statement `print m," ",n`.
print("%d   %d" % (m, n))
def test_binarizer():
    """Check Binarizer on dense, list and sparse input.

    Covers the positive-threshold and default-threshold counts, copy
    semantics, a negative threshold on dense input and the ValueError raised
    for a negative threshold on sparse input.
    """

    def check_counts(result, n_zeros, n_ones):
        # Number of 0 and 1 entries in the dense result.
        assert_equal(np.sum(result == 0), n_zeros)
        assert_equal(np.sum(result == 1), n_ones)

    X_ = np.array([[1, 0, 5], [2, 3, -1]])

    for make in (np.array, list, sparse.csr_matrix, sparse.csc_matrix):
        X = make(X_.copy())

        # Explicit threshold of 2.
        thresholded = Binarizer(threshold=2.0, copy=True)
        check_counts(toarray(thresholded.transform(X)), 4, 2)
        raw_out = thresholded.transform(X)
        assert_equal(sparse.issparse(X), sparse.issparse(raw_out))

        # Default threshold after fit().
        fitted = Binarizer(copy=True).fit(X)
        dense_out = toarray(fitted.transform(X))
        assert_true(dense_out is not X)
        check_counts(dense_out, 2, 4)

        # Default threshold, copy=True: result must not alias the input.
        copying = Binarizer(copy=True)
        out = copying.transform(X)
        assert_true(out is not X)
        check_counts(toarray(out), 2, 4)

        # copy=False: in-place for everything except plain lists.
        in_place = Binarizer(copy=False)
        out = in_place.transform(X)
        if make is not list:
            assert_true(out is X)
        check_counts(toarray(out), 2, 4)

    negative = Binarizer(threshold=-0.5, copy=True)
    for make in (np.array, list):
        X = make(X_.copy())
        check_counts(toarray(negative.transform(X)), 1, 5)
        negative.transform(X)

    # Cannot use threshold < 0 for sparse
    assert_raises(ValueError, negative.transform, sparse.csc_matrix(X))
'LR': LogisticRegression, 'LSVC' : LinearSVC, 'SVC' : SVC } #%% os.chdir(workspace) dev_idvs_all = numpy.nan_to_num(numpy.load(dev_filename + ".npy")) val_idvs_all = numpy.nan_to_num(numpy.load(val_filename + ".npy")) dev_dvs = numpy.nan_to_num(numpy.load(dev_filename + "_dvs.npy")) val_dvs = numpy.nan_to_num(numpy.load(val_filename + "_dvs.npy")) binarizer = Binarizer(copy=True, threshold=thresh) imputer = Imputer(copy = False) dev_dvs_binary = binarizer.transform(dev_dvs).reshape((dev_dvs.shape[0],)) val_dvs_binary = binarizer.transform(val_dvs).reshape((val_dvs.shape[0],)) """ from statsmodels.regression import quantile_regression dev_idvs2 = dev_idvs[:10000,:] inds = [i for i in xrange(dev_idvs2.shape[1]) if len(unique(dev_idvs2[:,i])) > 1] dev_dvs2 = dev_dvs[:10000,:].reshape((10000,)) model = quantile_regression.QuantReg(dev_dvs2, dev_idvs2) model.fit() """
class Binarizer(TransformerMixin):
    """
    Implements several feature binarization strategies: computes optimal
    thresholds and binarizes the data with those thresholds.

    Arguments:
    ----------
    method: str('random', 'log_odds' or 'bns'), feature binarization method
    divide_to_bins: bool(optional, default=True),
        whether quantitative features are converted to integer-valued ones
    bins_number: int(optional, default=10),
        number of possible values of the integer-valued features
        during binarization
    """
    _UNSUPERVISED_METHODS = ['random']
    _SUPERVISED_METHODS = ['log_odds', 'bns']
    _CONTINGENCY_METHODS = ['log_odds', 'bns']

    def __init__(self, method, divide_to_bins=True, bins_number=10):
        self.method = method
        self.divide_to_bins = divide_to_bins
        self.bins_number = bins_number

    def fit(self, X, y=None):
        """
        Fit the binarizer to the data.

        Raises ValueError for an unknown method, for a missing ``y`` with a
        supervised method, or when X and y have different sample counts.
        """
        methods = Binarizer._UNSUPERVISED_METHODS + Binarizer._SUPERVISED_METHODS
        if self.method not in methods:
            raise ValueError("Method should be one of {0}".format(", ".join(methods)))
        X = check_array(X, accept_sparse=['csr', 'csc'])
        if issparse(X):
            X = X.tocsc()
        if self.method in Binarizer._UNSUPERVISED_METHODS:
            self._fit_unsupervised(X)
            self.joint_thresholds_ = self.thresholds_
            self.joint_scores_ = self.scores_
        else:
            if y is None:
                raise ValueError("y must not be None for supervised binarizers.")
            # TODO: move the label encoding into a separate function
            label_binarizer = SK_LabelBinarizer()
            Y_new = label_binarizer.fit_transform(y)
            self.classes_ = label_binarizer.classes_
            if X.shape[0] != Y_new.shape[0]:
                raise ValueError("X and y have incompatible shapes.\n"
                                 "X has %s samples, but y has %s."
                                 % (X.shape[0], Y_new.shape[0]))
            self._fit_supervised(X, Y_new)
            if len(self.classes_) <= 2:
                self.joint_thresholds_ = self.thresholds_[:, 0]
                self.joint_scores_ = self.scores_[:, 0]
            else:
                min_class_scores = np.min(self.scores_, axis=0)
                max_class_scores = np.max(self.scores_, axis=0)
                diffs = max_class_scores - min_class_scores
                # avoid division by zero for classes with constant scores
                diffs[np.where(diffs == 0)] = 1
                normalized_scores = (self.scores_ - min_class_scores) / diffs
                # for each feature find the class for which it is most useful
                # (this could probably be done differently)
                optimal_indexes = np.argmax(normalized_scores, axis=1)
                nfeat = self.thresholds_.shape[0]
                # as the binarization threshold of each feature take the
                # value for the class where the feature is most useful
                self.joint_thresholds_ = self.thresholds_[np.arange(nfeat), optimal_indexes]
                self.joint_scores_ = self.scores_[np.arange(nfeat), optimal_indexes]
        # Pass the thresholds to the sklearn binarizer.  `threshold` is given
        # by keyword because recent scikit-learn releases make it
        # keyword-only.
        self.binarize_transformer_ = SK_Binarizer(threshold=self.joint_thresholds_)
        return self

    def transform(self, X):
        """
        Apply the fitted binarizer to the data.

        Raises ValueError if fit() has not been called.
        """
        print("Transforming binarizer...")
        if hasattr(self, 'binarize_transformer_'):
            return self.binarize_transformer_.transform(X)
        else:
            raise ValueError("Transformer is not fitted")

    def _fit_unsupervised(self, X):
        """
        Dispatcher for the unsupervised threshold selection methods.
        """
        if self.method == 'random':
            # random thresholds and random usefulness scores
            if issparse(X):
                minimums = X.min(axis=0).toarray()
                maximums = X.max(axis=0).toarray()
            else:
                minimums = np.min(X, axis=0)
                maximums = np.max(X, axis=0)
            random_numbers = np.random.rand(X.shape[1], 1).reshape((X.shape[1],))
            self.thresholds_ = minimums + (maximums - minimums) * random_numbers
            self.scores_ = np.random.rand(X.shape[1], 1).reshape((X.shape[1],))
        return self

    def _fit_supervised(self, X, y):
        """
        Select the thresholds in a supervised way, one feature at a time.
        """
        # convert X to integer values if required
        if self.divide_to_bins:
            bin_divider = BinDivider(bins_number=self.bins_number)
            X = bin_divider.fit_transform(X)
        thresholds, scores = [], []
        for i in range(X.shape[1]):
            threshold, score = self._find_optimal_thresholds(X[:, i], y)
            thresholds.append(threshold)
            scores.append(score)
        self.thresholds_ = np.asarray(thresholds, dtype=np.float64)
        self.scores_ = np.asarray(scores, dtype=np.float64)
        return self

    def _find_optimal_thresholds(self, column, y):
        """
        Compute the binarization thresholds for one feature.

        Arguments:
        ----------
        column: array-like, shape=(nobj,), column of feature values
        y: array-like, shape=(nobj, nclasses), 0/1 class indicator matrix

        Returns one best threshold and one best score per class
        (a single pair for binary classification).
        """
        classes_number = y.shape[1]
        # compute how often each feature value occurs for each class
        values, counts = \
            _collect_column_statistics(column, y, classes_number=classes_number, precision=6)
        if self.method in Binarizer._CONTINGENCY_METHODS:
            # binary classification
            if classes_number <= 2:
                counts = [counts]
            else:
                # one-vs-rest counts for every class
                summary_counts = np.sum(counts, axis=1)
                counts = [np.array((summary_counts - counts[:, i], counts[:, i])).T
                          for i in np.arange(classes_number)]
            best_thresholds = [None] * len(counts)
            best_scores = [None] * len(counts)
            for i in np.arange(len(counts)):
                current_thresholds, current_tables = \
                    _collect_contingency_tables(values, counts[i])
                if self.method == "log_odds":
                    func = (lambda x: odds_ratio(x, alpha=0.1))
                elif self.method == 'information_gain':
                    func = information_gain
                elif self.method == 'bns':
                    func = bns
                else:
                    raise ValueError("Wrong binarization method: {0}".format(self.method))
                scores = [func(table) for table in current_tables]
                best_score_index = np.argmax(scores)
                best_thresholds[i] = current_thresholds[best_score_index]
                best_scores[i] = scores[best_score_index]
        return best_thresholds, best_scores
# Result is discarded when run as a script; only useful in a notebook cell.
train.isnull().sum()
sns.distplot(train['Upvotes']);
plt.show()

# Drop rows with more than 3,000,000 views.
train = train.drop(train[train.Views > 3000000].index)

labelencoder_X = LabelEncoder()
train['Tag'] = labelencoder_X.fit_transform(train['Tag'])
train.drop(['ID','Username'], axis=1,inplace =True)
target = train['Upvotes']

from sklearn.preprocessing import Binarizer
# Flag rows with more than 7 answers.  The Series is wrapped in a list so
# Binarizer sees a single 2-D row, and [0] unwraps that row again.
bn = Binarizer(threshold=7)
pd_watched = bn.transform([train['Answers']])[0]
train['pd_watched'] = pd_watched
train.head()

# Every column except the target is a feature.
feature_names = [x for x in train.columns if x not in ['Upvotes']]
x_train, x_val, y_train, y_val = train_test_split(train[feature_names], target,test_size = 0.22,random_state =205)

# The scaler is fitted on the training split only and reused for validation.
sc_X = StandardScaler()
x_train = sc_X.fit_transform(x_train)
x_val = sc_X.transform(x_val)

poly_reg = PolynomialFeatures(degree = 4,interaction_only=False, include_bias=True)
X_poly = poly_reg.fit_transform(x_train)
# NOTE(review): poly_reg was already fitted by fit_transform above; this
# second fit looks redundant — confirm.
poly_reg.fit(x_train, y_train)
from sklearn.kernel_approximation import RBFSampler, Nystroem from sklearn.cluster import FeatureAgglomeration from sklearn.feature_selection import SelectFwe, SelectKBest, SelectPercentile, VarianceThreshold from sklearn.feature_selection import SelectFromModel, RFE from sklearn.ensemble import ExtraTreesClassifier from xgboost import XGBClassifier from sklearn.model_selection import cross_val_predict from sklearn.metrics import accuracy_score, f1_score from tpot_metrics import balanced_accuracy_score from sklearn.pipeline import make_pipeline import itertools dataset = sys.argv[1] preprocessor_list = [Binarizer(), MaxAbsScaler(), MinMaxScaler(), Normalizer(), PolynomialFeatures(), RobustScaler(), StandardScaler(), FastICA(), PCA(), RBFSampler(), Nystroem(), FeatureAgglomeration(), SelectFwe(), SelectKBest(), SelectPercentile(), VarianceThreshold(), SelectFromModel(estimator=ExtraTreesClassifier(n_estimators=100)), RFE(estimator=ExtraTreesClassifier(n_estimators=100))] # Read the data set into memory input_data = pd.read_csv(dataset, compression='gzip', sep='\t').sample(frac=1., replace=False, random_state=42) with warnings.catch_warnings(): warnings.simplefilter('ignore') for (preprocessor, learning_rate, n_estimators, max_depth) in itertools.product( preprocessor_list, [0.01, 0.1, 0.5, 1.0, 10.0, 50.0, 100.0],
from sklearn.cluster import FeatureAgglomeration from sklearn.feature_selection import SelectFwe, SelectKBest, SelectPercentile, VarianceThreshold from sklearn.feature_selection import SelectFromModel, RFE from sklearn.ensemble import ExtraTreesClassifier from sklearn.naive_bayes import MultinomialNB from sklearn.model_selection import cross_val_predict from sklearn.metrics import accuracy_score, f1_score from tpot_metrics import balanced_accuracy_score from sklearn.pipeline import make_pipeline import itertools dataset = sys.argv[1] preprocessor_list = [ Binarizer(), MaxAbsScaler(), MinMaxScaler(), Normalizer(), PolynomialFeatures(), RobustScaler(), StandardScaler(), FastICA(), PCA(), RBFSampler(), Nystroem(), FeatureAgglomeration(), SelectFwe(), SelectKBest(), SelectPercentile(), VarianceThreshold(),
# print('loaded_text', loaded_text) X, y = np.hsplit(loaded_text, [-1]) y = y.flatten() return X, y def get_column_names(path): with open(path) as fp: header = fp.readline().split(',')#[1:-1] return header X, y = get_training_data(data_path) letter_names = X[:, 0].reshape(-1, 1) letter_sounds = X[:, 1].reshape(-1, 1) # Binarize labels y = Binarizer(threshold=fail_threshold).transform(y.reshape(1, -1))[0] reading_data = (X, y) datasets = [ reading_data ] # points where we want ticks, as well as the label for that tick ticks = [ [0, 0], [13, 7], [26, 13], [39, 20], [52, 26] ]
from sklearn.neural_network import MLPClassifier Data_reader = csv_reader.CsvReader('../DataSet') Data_writer = csv_reader.CsvReader('../output') if __name__ == '__main__': train_data = Data_reader.read_data('train.csv') test_data = Data_reader.read_data('test.csv') # X_train = train_data.iloc[:, 1:] # Y_train = train_data.iloc[:, 0] # X_test = test_data train_data_bin = Binarizer(threshold=127).fit_transform(train_data) X_test_bin = Binarizer(threshold=127).fit_transform(test_data) X_train = pd.DataFrame(train_data_bin).iloc[:, 1:] Y_train = train_data.iloc[:, 0] X_test = pd.DataFrame(X_test_bin) train_images, vali_images, train_labels, vali_labels = \ train_test_split(X_train, Y_train, train_size=0.95,random_state=1) print('start predict') predict = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(100, ), random_state=5) predict.fit(train_images, train_labels)
# In[3]: # Import csv data raw_data = pd.read_csv('OnlineNewsPopularity_wLabels_deleteNoise.csv').iloc[:, 1:] # read in csv, omit the first column of url raw_data = raw_data.iloc[:, :-1] news_data = raw_data.iloc[:, :-1] # Take up to the second last column news_labels = raw_data.iloc[:, -1] # Take shares column for labels # Binarize print '\nBinary Threshold:' binary_threshold = np.median(raw_data[' shares']) news_data = news_data.drop(' n_non_stop_words', 1) print binary_threshold binarizer = Binarizer(threshold=binary_threshold) y_binary = binarizer.transform(news_labels).transpose().ravel() # In[ ]: # Discretize # In[25]: # Decision Tree from sklearn.tree import DecisionTreeClassifier tree = DecisionTreeClassifier() print 'Decision Tree Classifier Accuracy Rate' tree_score = cross_val_score(tree, news_data, y_binary, cv=10)
import deepdish as dd
from bokeh.plotting import figure
from bokeh.io import export_svgs

machine = RBM(32 * 32, 600, 100, (32, 32), 32, 'cd')
machine.from_saved_model(
    '/cluster/home/fdangelo/Restricted-Boltzmann-Machines/logs/scalars/1604-004823'
)
datah5 = dd.io.load(
    '/Users/fdangelo/PycharmProjects/myRBM/data/ising/ising_data_complete.hdf5'
)
data_bin = {}
datah5_norm = {}
#Take spin up as standard configuration
keys = list(datah5.keys())
binarizer = Binarizer(threshold=0)
for key in keys:
    # Flip every configuration whose entries sum to a negative value.
    datah5_norm[key] = np.array(
        [np.where(np.sum(slice) < 0, -slice, slice) for slice in datah5[key]])
    # Binarize each configuration at 0, flatten it to a vector and cast to
    # float32.
    data_bin[key] = np.array([
        binarizer.fit_transform(slice) for slice in datah5_norm[key]
    ]).reshape(datah5_norm[key].shape[0], -1).astype(np.float32)

# Mean of the binarized configurations, averaged per key.
magn_key_mean = []
for key in keys:
    print(data_bin[key].shape[0])
    magn_23 = np.array(
        [np.mean(data_bin[key][i]) for i in range(data_bin[key].shape[0])])
    magn_key_mean.append(np.mean(magn_23))

steps = [200, 1000, 10000, 100000, 1000000]
# binarization from sklearn.preprocessing import Binarizer import pandas import numpy url = "https://goo.gl/vhm1eU" names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class'] dataframe = pandas.read_csv(url, names=names) array = dataframe.values # separate array into input and output components X = array[:,0:8] Y = array[:,8] binarizer = Binarizer(threshold=0.0).fit(X) binaryX = binarizer.transform(X) # summarize transformed data numpy.set_printoptions(precision=3) print(binaryX[0:5,:])
try: hub_ego=nx.ego_graph(Gsim,head_node,radius = 1) # step 1 only except NameError: head_node = each[0] hub_ego=nx.ego_graph(Gsim,head_node,radius = 1) index = hub_ego.nodes() # pec = random.uniform(0.5,0.8) # percentage of nodes selected between [0.5,0.8] pec = 0.8 random.shuffle(index) subidx = index[:int(pec*len(index))] Y = np.zeros(num_sample) Y[::5] += 3 * (0.5-np.random.rand(num_sample/5)) # add noise to targets for each in subidx[1:]: Y += np.power(data_mat[:,each],3) binarizer = Binarizer() label = binarizer.transform(Y) # output the gene expression matrix ofp = open('nonlinear2.'+str(i)+'.genemat','w') for each in sorted(Gsim.nodes()): print >> ofp, str(each)+'\t'+'\t'.join(map(str,data_mat[:,each])) print >> ofp, 'outcome\t'+'\t'.join(map(str,label)) ofp.close() #print 'significant network',index nx.write_adjlist(Gsim,"nonlinear2."+str(i)+".adjlist") os.system('epd_python svmnet.py -n nonlinear2.'+str(i)+'.adjlist -g nonlinear2.'+str(i)+'.genemat -o nonlinear2.svm.'+str(i)+'.txt -s 0') # os.system('epd_python ../rfnet.py -n nonlinear2.adjlist -g nonlinear2.genemat -o nonlinear2.rf.txt -s 0 -r 20') os.system('epd_python knnnet.py -n nonlinear2.'+str(i)+'.adjlist -g nonlinear2.'+str(i)+'.genemat -o nonlinear2.knn.'+str(i)+'.txt -s 0') svm_count += count_net('nonlinear2.svm.'+str(i)+'.txt',index) #rf_count += count_net('nonlinear2.rf.txt',index)
_, n_features = X.get_shape() print('Loading test data...') with open('data/test-svmlight.dat') as infile: lines = infile.readlines() n_samples = len(lines) test = lil_matrix((n_samples, n_features)) for n,line in enumerate(lines): for word_count in line.split(): fid, count = word_count.split(':') test[n,int(fid)] = int(fid) test = test.tocsr() if opts.binarize: print('Binarizing the data...') binar = Binarizer(copy=False) X = binar.transform(X) test = binar.transform(test) if opts.tfidf: print('Transforming word occurrences into TF-IDF...') tranny = TfidfTransformer() X = tranny.fit_transform(X) test = tranny.transform(test) if opts.select_features: k_features = int(opts.k_features) if opts.select_features == 'k-best': print('Selecting %i best features...' % k_features) ch2 = SelectKBest(chi2, k=k_features) if opts.select_features == 'pct':
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Binarizer
from tpot.builtins import DatasetSelector

# NOTE: the class column must be labeled 'target' in the data file.
tpot_data = pd.read_csv(
    'PATH/TO/DATA/FILE',
    sep='COLUMN_SEPARATOR',
    dtype=np.float64,
)
features = tpot_data.drop('target', axis=1).values

# Hold out a test split; the seed is fixed so the split is reproducible.
(training_features,
 testing_features,
 training_target,
 testing_target) = train_test_split(
    features,
    tpot_data['target'].values,
    random_state=71,
)

# Average CV score on the training set was:0.7526956521739131
exported_pipeline = make_pipeline(
    DatasetSelector(sel_subset=4, subset_list="module23.csv"),
    Binarizer(threshold=0.30000000000000004),
    RandomForestClassifier(
        bootstrap=True,
        criterion="entropy",
        max_features=0.25,
        min_samples_leaf=5,
        min_samples_split=4,
        n_estimators=100,
    ),
)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
except NameError: head_node = each[0] hub_ego = nx.ego_graph(Gsim, head_node, radius=1) index = shuffle(hub_ego.nodes(), random_state=random_state) head_set = set([head_node]) subidx = set(index[:int(pec * len(index))]) if head == 'in': subidx |= head_set else: subidx -= head_set Y = np.zeros(num_sample) Y[::5] += 3 * (0.5 - np.random.rand(num_sample / 5) ) # add noise to targets for each in subidx: Y += np.power(data_mat[:, each], 3) binarizer = Binarizer() label = binarizer.transform(Y) # output the gene expression matrix ofp = open('nonlinear.genemat', 'w') for each in sorted(Gsim.nodes()): print >> ofp, str(each) + '\t' + '\t'.join( map(str, data_mat[:, each])) print >> ofp, 'outcome\t' + '\t'.join(map(str, label)) ofp.close() # output the network adjlist file nx.write_adjlist(Gsim, "nonlinear.adjlist") os.system( 'epd_python egonet.py -n nonlinear.adjlist -g nonlinear.genemat -o nonlinear.egonet.txt -s 0'
import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Binarizer

# NOTE: the class column must be labeled 'target' in the data file.
tpot_data = pd.read_csv(
    'PATH/TO/DATA/FILE',
    sep='COLUMN_SEPARATOR',
    dtype=np.float64,
)
features = tpot_data.drop('target', axis=1).values

# random_state=None: the train/test split differs from run to run.
(training_features,
 testing_features,
 training_target,
 testing_target) = train_test_split(
    features,
    tpot_data['target'].values,
    random_state=None,
)

# Average CV score on the training set was:0.518333333333
exported_pipeline = make_pipeline(
    Binarizer(threshold=0.45),
    GradientBoostingClassifier(
        learning_rate=1.0,
        max_depth=7,
        max_features=0.55,
        min_samples_leaf=1,
        min_samples_split=13,
        n_estimators=100,
        subsample=0.35,
    ),
)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
def solucao_casao():
    """Train a decision tree on binarized PLA data and emit it as Verilog.

    Reads the training and validation tables from ``PLA_dump/``, binarizes
    both at threshold 0.0, fits a ``DecisionTreeClassifier`` on the training
    table (the last 10 columns are used as targets, the rest as features),
    then writes one ``assign`` per tree node to ``pla_all_training.v`` and a
    text rendering of the tree to ``tree_test.tree``.

    The Verilog file is opened only once the tree is ready and is closed by
    a ``with`` block, so a failure while loading or training neither leaks
    the handle nor leaves a truncated output file behind.
    """
    train_path = "PLA_dump/pla_cifar10_chunk_500_train.pla"
    valid_path = "PLA_dump/pla_cifar10_chunk_499_valid.pla"

    # Input and output counts of the PLA; the third value is not used here.
    n_inputs, n_outputs, _ = read_iop(train_path)
    ts = tt2df(train_path)
    vs = tt2df(valid_path)
    print(ts.iloc[:, -10:].values)

    binarizer = Binarizer(threshold=0.0).fit(ts)
    binary_train = binarizer.transform(ts)
    binarizer = Binarizer(threshold=0.0).fit(vs)
    binary_valid = binarizer.transform(vs)
    print(type(ts))
    print(type(binary_train))
    ts = pd.DataFrame(binary_train)
    vs = pd.DataFrame(binary_valid)

    # Split features and the target variable
    X_train = ts.iloc[:, :-10].values
    y_train = ts.iloc[:, -10:].values
    X_val = vs.iloc[:, :-10].values
    y_val = vs.iloc[:, -10:].values

    # Debug output describing the training targets.
    print(X_train.shape)
    print(y_train.shape)
    print(type(y_train[0]))
    print(type(y_train[0][0]))
    print(y_train[0][0])
    print(y_train[1][0])
    print(y_train[2][0])
    print(y_train[3][0])
    print(y_train)

    # Definition of the classifier
    clf = DecisionTreeClassifier(
        random_state=9856230,
        criterion='gini',
        max_depth=18,
    )

    # Training and validation of the classifier
    clf.fit(X_train, y_train)
    y_predicted = clf.predict(X_val)
    y_predicted_training = clf.predict(X_train)

    # Generate a Verilog description of the classifier.
    # TODO: create a function out of this code snippet.
    n_nodes = clf.tree_.node_count
    children_left = clf.tree_.children_left
    children_right = clf.tree_.children_right
    feature = clf.tree_.feature

    # A node is a leaf exactly when its two child ids are equal; this is the
    # same test the previous explicit stack traversal applied to every node.
    is_leaves = children_left == children_right

    input_names = ', '.join(
        ['x{}'.format(v) for v in np.arange(0, n_inputs)])
    output_names = ', '.join(
        ['y{}'.format(v) for v in np.arange(0, n_outputs)])
    wire_names = ', '.join(
        ['n{}'.format(v) for v in np.arange(0, n_nodes)])

    with open('pla_all_training.v', 'w') as verilog:
        verilog.write('module top (')
        verilog.write('\t' + input_names + ', ')
        verilog.write(output_names + ');\n')
        verilog.write('input ' + input_names + ';\n')
        verilog.write('output ' + output_names + ';\n')
        verilog.write('wire ' + wire_names + ';\n')

        for node in range(n_nodes):
            if is_leaves[node]:
                # NOTE(review): argmax runs over the whole value array of the
                # node; with 10 target columns this is a flat index, which
                # may not be the intended per-output class - confirm.
                verilog.write('assign n{node} = {out_class};\n'.format(
                    node=node, out_class=np.argmax(clf.tree_.value[node])))
            else:
                # Internal node: a 2:1 mux selected by the inverted feature.
                verilog.write(
                    'assign n{node} = x{feature} ^ 1\'b1 ? n{left} : n{right};\n'.
                    format(node=node,
                           feature=feature[node],
                           left=children_left[node],
                           right=children_right[node]))

        # NOTE(review): the module declares outputs y0..y{n_outputs-1}, but
        # only an undeclared net `y` is driven here - confirm intent.
        verilog.write('assign y = n0;\n')
        verilog.write('endmodule')

    with open("tree_test.tree", "w") as arquivo:
        arquivo.write(tree.export_text(clf, max_depth=1000))
def test_fit_transform():
    """Check that fit(X).transform(X) and fit_transform(X) give equal output.

    Runs the comparison on a random 5x4 matrix for a Scaler, a Normalizer
    and a Binarizer, each with default parameters.
    """
    X = np.random.random((5, 4))
    transformers = (Scaler(), Normalizer(), Binarizer())
    for transformer in transformers:
        two_step = transformer.fit(X).transform(X)
        one_step = transformer.fit_transform(X)
        assert_array_equal(two_step, one_step)
with tf.name_scope('stddev'): stddev = tf.sqrt(tf.reduce_mean(tf.square(var - mean))) tf.summary.scalar('stddev', stddev, step) tf.summary.scalar('max', tf.reduce_max(var), step) tf.summary.scalar('min', tf.reduce_min(var), step) tf.summary.histogram('histogram', var, step = step) if __name__ == '__main__': #Load and preprocess data here = os.path.dirname(os.path.abspath(__file__)) datah5 = dd.io.load('/cluster/scratch/fdangelo/RBM/data/ising/ising_data_L32_large.hdf5') # Transform -1 in 0 and take spin up as standard configuration binarizer = Binarizer(threshold=0) keys = list(datah5.keys()) # put here the temperature from keys that you want to use for the training #class_names = [keys[i] for i in [4, 6, 7, 8, 9, 10, 11, 12, 16]] class_names = [keys[i] for i in [6,8,9,10,11,12,16]] n_samples = datah5[keys[0]].shape[0] datah5_norm = {} data_bin = {} for key in class_names: if key==keys[9]: datah5_norm[key] = np.array([np.where(np.sum(slice)<0,-slice,slice) for slice in datah5[key]]) data_bin[key] = np.array([binarizer.fit_transform(slice) for slice in datah5_norm[key]]) else: #datah5_norm[key] = np.array([np.where(np.sum(slice)<0,-slice,slice) for slice in datah5[key]]) data_bin[key] = np.array([binarizer.fit_transform(slice) for slice in datah5[key]])
def testPreProc():
    """Run a tour of scikit-learn preprocessing and feature selection on iris.

    Every transform is executed for demonstration only: the results are
    discarded and the function returns ``None``.
    """
    iris = load_iris()

    # Rescaling brings features measured on different scales onto a common
    # one. The usual methods are standardization and interval scaling.
    # Standardization works on the columns of the feature matrix: it converts
    # each feature to a common scale through its z-score.
    # Normalization works on the rows of the feature matrix: it gives sample
    # vectors a uniform standard (unit vectors) for dot products and other
    # kernel similarity computations.
    from sklearn.preprocessing import StandardScaler
    from sklearn.preprocessing import MinMaxScaler
    MinMaxScaler().fit_transform(iris.data)
    StandardScaler().fit_transform(iris.data)

    # Binarization with the threshold set to 3; returns the binarized data.
    from sklearn.preprocessing import Binarizer
    Binarizer(threshold=3).fit_transform(iris.data)

    # Dummy (one-hot) encoding of the iris target; note the input must be 2D.
    # OneHotEncoder(sparse = False).fit_transform( testdata[['age']] )
    from sklearn.preprocessing import OneHotEncoder
    OneHotEncoder().fit_transform(iris.target.reshape((-1, 1)))

    # String-valued discrete variables can first be converted to numbers with
    # LabelEncoder and then encoded with OneHotEncoder.
    # Note that LabelEncoder takes 1D input while OneHotEncoder takes 2D.
    from sklearn.preprocessing import LabelEncoder
    # Encode the 1D label vector; indexing the feature array with "" raised.
    LabelEncoder().fit_transform(iris.target)

    # Missing-value imputation; returns the data with missing values filled.
    # Parameter missing_value is the representation of a missing value,
    # NaN by default.
    # Parameter strategy is the fill method, mean by default.
    from numpy import vstack, array, nan
    try:
        # Current location of the imputer; the class in
        # sklearn.preprocessing no longer exists in recent releases.
        from sklearn.impute import SimpleImputer as Imputer
    except ImportError:
        from sklearn.preprocessing import Imputer
    Imputer().fit_transform(vstack((array([nan, nan, nan, nan]), iris.data)))

    # Data transformation
    # Polynomial transformation; parameter degree is the polynomial degree.
    from sklearn.preprocessing import PolynomialFeatures
    PolynomialFeatures().fit_transform(iris.data)

    # Transformation with a custom function, here the logarithm.
    # The first argument is a function of a single variable.
    from numpy import log1p
    from sklearn.preprocessing import FunctionTransformer
    FunctionTransformer(log1p).fit_transform(iris.data)

    # Feature selection: filter methods
    # Variance threshold; returns the data after feature selection.
    # Parameter threshold is the variance threshold.
    from sklearn.feature_selection import VarianceThreshold
    VarianceThreshold(threshold=3).fit_transform(iris.data)

    # Select the K best features; returns the data with selected features.
    # The first argument is the scoring function: it takes the feature matrix
    # and the target vector and returns an array of (score, p-value) pairs
    # whose i-th entry belongs to the i-th feature. Here it is the Pearson
    # correlation coefficient.
    # The second argument k is the number of features to keep.
    from sklearn.feature_selection import SelectKBest
    from scipy.stats import pearsonr
    # A list comprehension (rather than a bare map) so that array() receives
    # a real sequence on Python 3 as well as Python 2.
    SelectKBest(
        lambda X, Y: array([pearsonr(x, Y) for x in X.T]).T,
        k=2).fit_transform(iris.data, iris.target)

    # Scoring function: chi-squared test.
    from sklearn.feature_selection import chi2
    SelectKBest(chi2, k=2).fit_transform(iris.data, iris.target)

    # Mutual information method
    # from minepy import MINE
    # # MINE is not designed functionally, so define mic() to wrap it as a
    # # function returning a pair whose second item is a fixed p-value of 0.5
    # def mic(x, y):
    #     m = MINE()
    #     m.compute_score(x, y)
    #     return (m.mic(), 0.5)
    # SelectKBest(lambda X, Y: array(map(lambda x: mic(x, Y), X.T)).T, k=2).fit_transform(iris.data, iris.target)

    # Feature selection: wrapper methods
    # Recursive feature elimination; returns the data after selection.
    # Parameter estimator is the base model.
    # Parameter n_features_to_select is the number of features to keep.
    from sklearn.feature_selection import RFE
    from sklearn.linear_model import LogisticRegression
    RFE(estimator=LogisticRegression(),
        n_features_to_select=2).fit_transform(iris.data, iris.target)

    # Feature selection: embedded methods
    # A base model with a penalty term both selects features and reduces
    # dimensionality. Use SelectFromModel from the feature_selection package
    # together with an L1-penalized logistic regression to select features.
    from sklearn.feature_selection import SelectFromModel
    # L1-penalized logistic regression as the base model. The solver is named
    # explicitly because the L1 penalty needs a solver that supports it.
    SelectFromModel(
        LogisticRegression(penalty="l1", C=0.1, solver="liblinear")
    ).fit_transform(iris.data, iris.target)

    # The L1 penalty reduces dimensionality by keeping only one of several
    # features that are equally correlated with the target, so an unselected
    # feature is not necessarily unimportant. It can therefore be combined
    # with an L2 penalty: if a feature has weight 1 under L1, gather the
    # features whose L2 weights are close to it and whose L1 weight is 0 into
    # one group, and share the L1 weight equally among that group.

    # GBDT as the base model for feature selection.
    from sklearn.ensemble import GradientBoostingClassifier
    SelectFromModel(GradientBoostingClassifier()).fit_transform(
        iris.data, iris.target)
# Load the data
from sklearn.datasets import load_iris
iris = load_iris()
iris.data
iris.target

# Data preprocessing
# Standardization
from sklearn.preprocessing import StandardScaler
StandardScaler().fit_transform(iris.data)

# Interval scaling / normalization
from sklearn.preprocessing import MinMaxScaler
MinMaxScaler().fit_transform(iris.data)

# Binarize quantitative features
from sklearn.preprocessing import Binarizer
Binarizer(threshold=3).fit_transform(iris.data)

# Dummy-encode qualitative features
#When a qualitative predictor has more than two levels, a single dummy
#variable cannot represent all possible values. In this situation, we can create
#additional dummy variables. For example, for the ethnicity variable we
#create two dummy variables. The first could be equation 3.28 in AN INTRODUCTION TO STATISTICAL LEARNING;
#and the second could be equation 3.29 in AN INTRODUCTION TO STATISTICAL LEARNING
from sklearn.preprocessing import OneHotEncoder
OneHotEncoder().fit_transform(iris.target.reshape((-1,1)))

# Missing-data handling
try:
    # Current location of the imputer; the class in sklearn.preprocessing
    # no longer exists in recent scikit-learn releases.
    from sklearn.impute import SimpleImputer as Imputer
except ImportError:
    from sklearn.preprocessing import Imputer
from numpy import vstack, array, nan
# Missing-value imputation; returns the data with missing values filled.
# Parameter missing_value is the representation of a missing value, NaN by default.
FunctionTransformer, KBinsDiscretizer, MinMaxScaler, Normalizer, OneHotEncoder, OrdinalEncoder, PolynomialFeatures, PowerTransformer, StandardScaler, ) from feature_engine.selection import DropFeatures from feature_engine.wrappers import SklearnTransformerWrapper _transformers = [ Binarizer(threshold=2), KBinsDiscretizer(n_bins=3, encode="ordinal"), StandardScaler(), MinMaxScaler(), Normalizer(), PowerTransformer(), FunctionTransformer(np.log, validate=True), OrdinalEncoder(), ] _selectors = [ SelectFromModel(Lasso(random_state=1)), SelectKBest(f_regression, k=2), VarianceThreshold(), RFE(Lasso(random_state=1)), ]
# In[ ]:

# Rebuild the network and restore the trained weights from the checkpoint.
model = build_neural_network()
restorer = tf.train.Saver()
with tf.Session() as sess:
    restorer.restore(sess, "./titanic.ckpt")
    # is_training is fed as False for the prediction pass.
    feed = {model.inputs: test_data, model.is_training: False}
    test_predict = sess.run(model.predicted, feed_dict=feed)

test_predict[:10]


# In[ ]:

from sklearn.preprocessing import Binarizer

# Turn the predicted scores into 0/1 labels. `threshold` is passed by keyword
# because recent scikit-learn releases reject it as a positional argument.
binarizer = Binarizer(threshold=0.5)
test_predict_result = binarizer.fit_transform(test_predict)
test_predict_result = test_predict_result.astype(np.int32)
test_predict_result[:10]


# In[ ]:

# Assemble the submission table: passenger id plus predicted survival flag.
passenger_id = test_passenger_id.copy()
evaluation = passenger_id.to_frame()
evaluation["Survived"] = test_predict_result
evaluation[:10]


# In[ ]:

evaluation.to_csv("evaluation_submission.csv", index=False)
def test_binarizer():
    """Exercise Binarizer on dense and CSR input, with and without copying.

    For each container type the test checks the 0/1 counts produced with an
    explicit threshold of 2.0 and with the default threshold, and checks
    whether the returned object is the input itself (``copy=False``) or a
    different object (``copy=True``).
    """
    X_ = np.array([[1, 0, 5], [2, 3, 0]])

    def check_counts(result, n_zeros, n_ones):
        # Compare the number of 0 and 1 entries in a dense result.
        assert_equal(np.sum(result == 0), n_zeros)
        assert_equal(np.sum(result == 1), n_ones)

    for init in (np.array, sp.csr_matrix):
        X = init(X_.copy())

        # Explicit threshold: only the entries 5 and 3 end up as ones.
        thresholded = Binarizer(threshold=2.0, copy=True)
        check_counts(toarray(thresholded.transform(X)), 4, 2)

        # Default threshold, fitted first.
        fitted = Binarizer(copy=True).fit(X)
        X_bin = toarray(fitted.transform(X))
        assert_true(X_bin is not X)
        check_counts(X_bin, 2, 4)

        # Default threshold, transform without fitting; must return a copy.
        copying = Binarizer(copy=True)
        X_bin = copying.transform(X)
        assert_true(X_bin is not X)
        check_counts(toarray(X_bin), 2, 4)

        # copy=False must hand back the very same object. Kept last because
        # it overwrites X in place.
        inplace = Binarizer(copy=False)
        X_bin = inplace.transform(X)
        assert_true(X_bin is X)
        check_counts(toarray(X_bin), 2, 4)
import pandas as pd
import matplotlib.pyplot as plt

# Fetch the ratings file and read back the same local copy. The read path
# previously lacked the leading slash and pointed at a different location
# than the download target.
eider.s3.download('s3://eider-datasets/mlu/Book_Ratings.csv', '/tmp/Book_Ratings.csv')
data = pd.read_csv('/tmp/Book_Ratings.csv', dtype={
    'User': str,
    'ASIN': str,
    # Builtin int: the `np.int` alias has been removed from NumPy.
    'Rating': int,
})

# Stratify on ASIN when splitting into train and test.
train, test = train_test_split(data, random_state=8675309, stratify=data['ASIN'])

# threshold=0: every strictly positive rating becomes 1.
binarizer = Binarizer(threshold=0, copy=True)

# User x ASIN rating matrices (0 where no rating exists) and their binarized
# counterparts. `.values` replaces `.as_matrix()`, which pandas has removed.
S_train = train.pivot_table(index='User', columns='ASIN', values='Rating', fill_value=0).values
R_train = binarizer.fit_transform(S_train)

S_test = test.pivot_table(index='User', columns='ASIN', values='Rating', fill_value=0).values
R_test = binarizer.fit_transform(S_test)

uniqueUsers = data['User'].unique().tolist()
uniqueASINs = data['ASIN'].unique().tolist()
numUser = len(uniqueUsers)
# Drop column 0 of the training data; the remaining columns are the features.
x_all = np.delete(data_train, 0, axis=1)

#%% Pre-processing on X training data
# These are four alternative ways we tried to normalize the data;
# only a single method was used at a time.

# (1) TF-IDF weighting with L1-normalized rows
tfdif = TfidfTransformer(norm='l1')
x_tfdif = tfdif.fit_transform(x_all)

# (2) log normalization
x_log = np.log(1 + x_all)

# (3) binary normalization (convert all non-zero entries to 1)
binar = Binarizer()
x_bin = binar.fit_transform(x_all)

# (4) scale each sample (row) to unit L2 norm
normal = Normalizer(norm='l2')
x_normal = normal.fit_transform(x_all)

# Generate a single train/test split for validation (this is not
# cross-validation). The random state is fixed so the split can be reproduced.
x_train0, x_test0, y_train0, y_test0 = train_test_split(
    x_all,
    y_all,
    test_size=0.05,
    random_state=0,
)

#%% Utility function to report best scores
# # Binarization

# In[6]:

# Manual binarization: cap every listen count of 1 or more at 1.
watched = np.array(popsong_df['listen_count'])
watched = np.where(watched >= 1, 1, watched)
popsong_df['watched'] = watched
popsong_df.head(10)


# In[7]:

from sklearn.preprocessing import Binarizer

# Same idea with scikit-learn: the column is wrapped in a list so it is
# passed as a single row, and that row is unwrapped again with [0].
bn = Binarizer(threshold=0.9)
listen_counts_row = [popsong_df['listen_count']]
pd_watched = bn.transform(listen_counts_row)[0]
popsong_df['pd_watched'] = pd_watched
popsong_df.head(11)


# # Rounding

# In[8]:

items_popularity = pd.read_csv(
    'datasets/item_popularity.csv',
    encoding='utf-8',
)
items_popularity


# In[9]:
from Models import InteractionFeatures, Model, Bounder, RemoveDuplicateCols, ReturnSame, f1, lad
from sklearn.cross_validation import StratifiedKFold
from sklearn.grid_search import GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor

#%%
os.chdir(workspace)
logging.config.fileConfig('loggerConfig.properties')
logger = logging.getLogger('alllog')
logger.debug("Starting...")

binarizer = Binarizer(copy=True, threshold=thresh)

# One InteractionFeatures step per pairwise arithmetic operation, keyed by
# the step name used inside the FeatureUnion.
_interaction_methods = [
    ('if+', lambda x, y: (x + y)),
    ('if-', lambda x, y: (x - y)),
    ('if*', lambda x, y: (x * y)),
    ('if/', lambda x, y: (x / y)),
    ('if|', lambda x, y: (y / x)),
]

featureunion1 = FeatureUnion([
    # ('duplicater', ReturnSame()),
    (step_name,
     InteractionFeatures(method=method,
                         threshold=corr_thresh,
                         subsample=1,
                         logger=logger))
    for step_name, method in _interaction_methods
])

# Preprocessing: drop duplicate columns, add the interaction features, then
# pass the result through Bounder(inf, -inf).
pp_pipeline = Pipeline([
    ('removedupes', RemoveDuplicateCols(logger=logger)),
    ('featureextraction', featureunion1),
    ('bounder', Bounder(inf, -inf)),
])
def binarize(img, threshold):
    """Binarize ``img`` at ``threshold`` and return the result.

    ``copy=False`` asks the Binarizer to work in place where it can, so the
    caller's ``img`` may be modified.

    Parameters
    ----------
    img : array-like accepted by ``Binarizer.fit_transform``
        The values to binarize.
    threshold : float
        Cut-off handed to ``Binarizer``.

    Returns
    -------
    The output of ``Binarizer.fit_transform(img)``.
    """
    # Keyword form: recent scikit-learn releases reject a positional
    # `threshold` argument.
    binarizer = Binarizer(threshold=threshold, copy=False)
    return binarizer.fit_transform(img)
def _binarize_target_at_mean(target):
    """Return ``target`` as a 1D 0/1 vector, thresholded at its own mean."""
    b = Binarizer(threshold=np.mean(target))
    # Binarizer works on 2D input: lift the vector to a column and flatten.
    return b.fit_transform(np.asarray(target).reshape(-1, 1)).ravel()


def load(opt='custom', x_filename=None, y_filename=None, n_samples=0,
         samples_on='rows', **kwargs):
    """Load a specified dataset.

    This function can be used either to load one of the standard scikit-learn
    datasets or a different dataset saved as X.npy Y.npy in the working
    directory.

    Parameters
    -----------
    opt : {'iris', 'digits', 'diabetes', 'boston', 'gauss', 'circles',
           'moons', 'custom', 'GSEXXXXX'}, default: 'custom'
        Name of a predefined dataset to be loaded. 'iris', 'digits', 'diabetes'
        'boston', 'circles' and 'moons' refer to the correspondent
        `scikit-learn` datasets. 'gauss' generates a synthetic dataset with
        `generate_gauss`. 'custom' can be used to load a custom dataset which
        name is specified in `x_filename` and `y_filename` (optional).
        For 'diabetes' and 'boston' the continuous target is binarized at
        its mean.

    x_filename : string, default : None
        The data matrix file name.

    y_filename : string, default : None
        The label vector file name.

    n_samples : int
        The number of samples to be loaded. This comes handy when dealing with
        large datasets. When n_samples is less than the actual size of the
        dataset this function performs a random subsampling that is stratified
        w.r.t. the labels (if provided).

    samples_on : string
        This can be either in ['row', 'rows'] if the samples lie on the row of
        the input data matrix, or viceversa in ['col', 'cols'] the other way
        around.

    **kwargs
        Forwarded unchanged to `load_custom` when `opt` is 'custom'
        (e.g. `data_sep`, the data separator: comma, tab, blank space, etc.).

    Returns
    -----------
    X : array of float, shape : n_samples x n_features
        The input data matrix.

    y : array of float, shape : n_samples
        The label vector; np.nan if missing.

    feature_names : array of integers (or strings), shape : n_features
        The feature names; a range of number if missing.

    index : list of integers (or strings)
        This is the samples identifier, if provided as first column (or row) of
        of the input file. Otherwise it is just an incremental range of size
        n_samples.

    Raises
    -----------
    Exception
        If `opt` starts with 'gse' (GEO datasets must be converted first).
    ValueError
        If no dataset could be loaded, either because `opt` is not a
        recognised name or because reading the files failed.
    """
    data = None
    try:
        if opt.lower() == 'iris':
            data = datasets.load_iris()
        elif opt.lower() == 'digits':
            data = datasets.load_digits()
        elif opt.lower() == 'diabetes':
            data = datasets.load_diabetes()
            # Binarize the target itself; the previous code thresholded the
            # feature matrix and stored that as the target.
            data.target = _binarize_target_at_mean(data.target)
        elif opt.lower() == 'boston':
            data = datasets.load_boston()
            data.target = _binarize_target_at_mean(data.target)
        elif opt.lower() == 'gauss':
            means = np.array([[-1, 1, 1, 1], [0, -1, 0, 0], [1, 1, -1, -1]])
            sigmas = np.array([0.33, 0.33, 0.33])
            if n_samples <= 1:
                n_samples = 333
            xx, yy = generate_gauss(mu=means, std=sigmas, n_sample=n_samples)
            data = datasets.base.Bunch(data=xx, target=yy)
        elif opt.lower() == 'circles':
            if n_samples == 0:
                n_samples = 400
            xx, yy = datasets.make_circles(n_samples=n_samples, factor=.3,
                                           noise=.05)
            data = datasets.base.Bunch(data=xx, target=yy)
        elif opt.lower() == 'moons':
            if n_samples == 0:
                n_samples = 400
            xx, yy = datasets.make_moons(n_samples=n_samples, noise=.01)
            data = datasets.base.Bunch(data=xx, target=yy)
        elif opt.lower() == 'custom':
            data = load_custom(x_filename, y_filename, samples_on, **kwargs)
        elif opt.lower().startswith('gse'):
            raise Exception("Use ade_GEO2csv.py to convert GEO DataSets "
                            "into csv files.")
    except IOError as e:
        print("I/O error({0}): {1}".format(e.errno, e.strerror))

    if data is None:
        # Unknown `opt`, or the I/O error reported above: fail with a clear
        # message instead of an AttributeError on None.
        raise ValueError("No dataset could be loaded for opt={0!r}".format(opt))

    X, y = data.data, data.target
    if n_samples > 0 and X.shape[0] > n_samples:
        if y is not None:
            try:  # Legacy for sklearn
                sss = StratifiedShuffleSplit(y, test_size=n_samples, n_iter=1)
                # idx = np.random.permutation(X.shape[0])[:n_samples]
            except TypeError:
                # Newer API: labels are passed to split(), not the constructor.
                sss = StratifiedShuffleSplit(test_size=n_samples) \
                    .split(X, y)
            # Keep the "test" side of the single split as the subsample.
            _, idx = list(sss)[0]
        else:
            idx = np.arange(X.shape[0])
            np.random.shuffle(idx)
            idx = idx[:n_samples]

        X, y = X[idx, :], y[idx]
    else:
        # The length of index must be consistent with the number of samples
        idx = np.arange(X.shape[0])

    feat_names = data.feature_names if hasattr(data, 'feature_names') \
        else np.arange(X.shape[1])
    index = np.array(data.index)[idx] if hasattr(data, 'index') \
        else np.arange(X.shape[0])

    return X, y, feat_names, index
from sklearn.preprocessing import Binarizer, LabelEncoder, OneHotEncoder

onehot_encoder = OneHotEncoder()
label_encoder = LabelEncoder()

# Encode the string labels as integers and shape them into one column,
# the 2D layout the encoders below are given.
x = ['a', 'b', 'c']
encoded = label_encoder.fit_transform(x)
label_x = encoded.reshape([len(x), 1])
print(label_x)

# One-hot encoding of the integer codes, printed as a dense array.
onehot = onehot_encoder.fit_transform(label_x)
print(onehot.toarray())

# Binarize the integer codes at threshold 1.0.
binarizer = Binarizer(threshold=1.0).fit(label_x)
binarized = binarizer.transform(label_x)
print(binarized)
print(X_.shape)
# First-order terms: x1, x2, x3, x4
# Second-order interaction terms: x1*x2, x1*x3, x1*x4, x2*x3, x2*x4, x3*x4
# Third-order interaction terms: x1*x2*x3, x1*x2*x4, x1*x3*x4, x2*x3*x4

######################################################################
######## Part4. Type conversion (discretization, numeric encoding, dummy variables)
######################################################################

### Binary discretization {0,1}
# Values greater than threshold are marked 1; values less than or equal to
# threshold are marked 0.
from sklearn.preprocessing import Binarizer

cols = ['年龄', '收入']
est = Binarizer(threshold=50)
selected = df[cols]
X_ = est.fit_transform(selected)
print(X_)

### Multi-valued discretization with n_bins bins
from sklearn.preprocessing import KBinsDiscretizer

est = KBinsDiscretizer(n_bins=5, encode='ordinal')  # codes 0 .. n_bins-1
selected = df[cols]
X_ = est.fit_transform(selected)
print(X_)

# KBinsDiscretizer
# (n_bins=5, encode='onehot', strategy='quantile')
# n_bins : int or array-like, shape (n_features,) (default=5)
#     Number of bins; must not be less than 2.
# encode : {'onehot', 'onehot-dense', 'ordinal'}, (default='onehot')
def by_threshold(self, threshold=0.0):
    """Fit ``Skbin(threshold)`` on ``self.M`` and return ``self.M`` transformed.

    ``threshold`` is passed positionally as the first argument of ``Skbin``.
    """
    fitted = Skbin(threshold).fit(self.M)
    transformed = fitted.transform(self.M)
    return transformed
# Show data
print(houses)

print("\n")
print("\n")
print("\n")

#4.8 Discretizating Features
# Load libraries
import numpy as np
from sklearn.preprocessing import Binarizer

# Create feature
age = np.array([[6], [12], [20], [36], [65]])

# Create binarizer. The threshold is passed by keyword because recent
# scikit-learn releases reject it as a positional argument.
binarizer = Binarizer(threshold=18)

# Transform feature
binarizer.fit_transform(age)

# Bin feature
np.digitize(age, bins=[20, 30, 64])

# Bin feature (right=True makes each bin include its right edge)
np.digitize(age, bins=[20, 30, 64], right=True)

# Bin feature (a single edge gives two bins)
np.digitize(age, bins=[18])

#4.9 Grouping Observations Using Clustering
# Load libraries
import pandas as pd
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans
def random_forest_regressor(X, y, threshold, value, k_fold=5):
    """Cross-validate a random-forest regressor used as a classifier.

    The labels are one-hot encoded, a ``RandomForestRegressor`` is fitted on
    the encoded targets, and its continuous predictions are binarized at
    ``threshold`` and decoded back to labels. This is repeated over
    ``k_fold`` stratified shuffle splits with 20% of the data held out.

    Parameters
    ----------
    X : array-like
        Feature matrix; converted with ``np.asarray``.
    y : array-like of str
        Class labels. The label 'NOISE' is replaced by 'a' on a private copy,
        so the caller's array is left untouched.
    threshold : float
        Cut-off applied to the regressor output before decoding.
    value
        Not used by this function; kept so existing callers keep working.
    k_fold : int, default 5
        Number of shuffle-split iterations.

    Returns
    -------
    cm : ndarray
        Confusion matrix averaged over the folds.
    cm_labels : ndarray
        Label order of the confusion-matrix rows and columns.
    average_accuracy : float
        Mean of ``accuracies``.
    accuracies : ndarray
        Per-class F1 score averaged over the folds (named "accuracy" in the
        printed output, but computed with ``f1_score``).
    cm_values : list
        ``[TP, TN, FP, FN]``, each averaged per class over the folds.
    """
    X = np.asarray(X)
    # Copy before relabelling: np.asarray may return the caller's own array,
    # and the assignment below would then overwrite the caller's labels.
    y = np.array(y)
    y[y == 'NOISE'] = 'a'

    #enc = ColumnTransformer([("noise", OneHotEncoder(sparse = False, handle_unknown = 'error'), [0])], remainder = 'passthrough')
    #cat = [['noise', 'dusky', 'ratufa']]
    enc = OneHotEncoder(categories='auto', sparse=False, handle_unknown='error')
    y_regressor = enc.fit_transform(y.reshape(y.shape[0], 1))

    # dividing X, y into train and test data
    sss = StratifiedShuffleSplit(n_splits=k_fold, test_size=0.2, random_state=0)

    # Do K fold cross validation
    all_cms = []
    all_accuracies = []
    tp_array = []
    fp_array = []
    tn_array = []
    fn_array = []

    # The label set is fixed for all folds, so compute it once.
    cm_labels = np.unique(y)

    print('Doing {} fold cross validation predictions. Classes: {}'.format(
        k_fold, np.unique(y)))
    for k, (train_index, test_index) in enumerate(sss.split(X, y_regressor)):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y_regressor[train_index], y_regressor[test_index]
        y_test_cat = enc.inverse_transform(y_test)

        # training a classifier
        clf = RandomForestRegressor(random_state=0, n_estimators=100)
        clf.fit(X_train, y_train)
        predictions = clf.predict(X_test)
        print(predictions[0:10])
        # Turn the continuous per-class outputs into 0/1 indicators, then
        # decode them to labels.
        predictions = Binarizer(threshold=threshold).fit_transform(predictions)
        print(predictions[0:10])
        predictions_cat = enc.inverse_transform(predictions)
        print(predictions_cat[0:10])

        # NOTE(review): maps the label 'usky' back to 'dusky'; presumably a
        # workaround for a truncated label - confirm where 'usky' comes from.
        y_test_cat[y_test_cat == 'usky'] = 'dusky'
        predictions_cat[predictions_cat == 'usky'] = 'dusky'

        # model accuracy for X_test
        class_scores = f1_score(y_test_cat, predictions_cat, average=None)
        print('{}/{} folds mean accuracy: {}'.format(k + 1, k_fold,
                                                     np.mean(class_scores)))
        all_accuracies.append(class_scores)

        k_cm = confusion_matrix(y_test_cat, predictions_cat, labels=cm_labels)

        # Per-class counts derived from the confusion matrix of this fold.
        FP = k_cm.sum(axis=0) - np.diag(k_cm)
        FN = k_cm.sum(axis=1) - np.diag(k_cm)
        TP = np.diag(k_cm)
        TN = k_cm.sum().sum() - (FP + FN + TP)
        tp_array.append(TP)
        fp_array.append(FP)
        tn_array.append(TN)
        fn_array.append(FN)

        all_cms.append(k_cm)

    # Get averages across K fold cross validation
    final_tp = np.mean(np.asarray(tp_array), axis=0)
    final_tn = np.mean(np.asarray(tn_array), axis=0)
    final_fp = np.mean(np.asarray(fp_array), axis=0)
    final_fn = np.mean(np.asarray(fn_array), axis=0)
    cm_values = [final_tp, final_tn, final_fp, final_fn]

    accuracies = np.mean(np.asarray(all_accuracies), axis=0)
    average_accuracy = np.mean(accuracies)
    print('Average accuracy = {}'.format(average_accuracy))
    cm = np.mean(np.asarray(all_cms), axis=0)

    return cm, cm_labels, average_accuracy, accuracies, cm_values
X = (news_data * lasso_est.transpose()) # multiply element wise with lasso estimate df_Lasso = X[X.columns[(X != 0).any()]] # remove columns where all elements are zero print df_Lasso.shape # number of columns should significantly shrink depending on choice of alpha df_Lasso.columns.values.tolist() # In[104]: #obtain a split # from sklearn.cross_validation import train_test_split # X_train, X_test, y_train, y_test = train_test_split(df_Lasso, news_labels) #binarize from sklearn.preprocessing import Binarizer binarizer = Binarizer(threshold=binary_threshold) binary_labels = binarizer.transform(news_labels).transpose().ravel() # .ravel() is to fix "Too many array indices error" print binary_labels.shape # In[107]: from sklearn.neighbors import KNeighborsClassifier from sklearn.cross_validation import cross_val_score knn = KNeighborsClassifier(n_neighbors=1) # arbitrary k cv = cross_val_score(knn, df_Lasso, binary_labels, cv=10) print "Cross Validation Scores" print cv print 'Mean Cross Validation Score' print np.mean(cv)
def binarizer(args):
    """Build a copying Binarizer whose threshold is ``args['threshold']``.

    See https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.Binarizer
    """
    threshold = args['threshold']
    return Binarizer(copy=True, threshold=threshold)
import numpy as np
import pandas as pd
from sklearn.linear_model import RidgeCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Binarizer
from tpot.export_utils import set_param_recursive

# NOTE: the outcome column must be labeled 'target' in the data file.
tpot_data = pd.read_csv(
    'PATH/TO/DATA/FILE',
    sep='COLUMN_SEPARATOR',
    dtype=np.float64,
)
features = tpot_data.drop('target', axis=1)

# Hold out a test split; the seed is fixed so the split is reproducible.
(training_features,
 testing_features,
 training_target,
 testing_target) = train_test_split(
    features,
    tpot_data['target'],
    random_state=42,
)

# Average CV score on the training set was: -36.33392193683913
exported_pipeline = make_pipeline(
    Binarizer(threshold=0.2),
    RidgeCV(),
)
# Fix random state for all the steps in exported pipeline
set_param_recursive(exported_pipeline.steps, 'random_state', 42)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
tokenizer = tokenize(X_train)
X_tokens = tokenizer.transform(X_train)

# Train Recurrent Neural Network
model = train_RNN(tokenizer, X_tokens, y_train)
y_pred_tr = model.predict(X_tokens).flatten()

# Check overall performance
test_tokens = tokenizer.transform(X_test)
y_pred_tst = model.predict(test_tokens).flatten()

# Convert predictions to binary. The flat prediction vectors are reshaped to
# single columns before being handed to the Binarizer.
yhat_train = y_pred_tr.reshape(-1, 1)
yhat_test = y_pred_tst.reshape(-1, 1)
binarizer = Binarizer(threshold=0.5).fit(yhat_train)
yhat_tr_b = binarizer.transform(yhat_train).astype(int)
yhat_tst_b = binarizer.transform(yhat_test).astype(int)

# The file name must be a string literal; the bare name used before raised
# NameError as soon as this line ran.
save(model, 'review_score_full.pkl')
with open('review_tokenizer_full.pkl', 'wb') as fileObject:
    pickle.dump(tokenizer, fileObject)

# # Save model for future use
# save(model, 'review_scorer1.pkl')
# # model = load('review_scorer.pkl')
# with open('review_tokenizer1.pkl','wb') as fileObject:
#     pickle.dump(tokenizer, fileObject)

# Scorers to consider
def fp_vectorizer(self, processed_data):
    """Return ``processed_data`` binarized with a fixed threshold of 5."""
    return Binarizer(threshold=5).fit_transform(processed_data)