def norm_data(norm_num, X_train_or, X_test_or):
    if norm_num == 0:
        X_tr_norm = X_train_or
        X_ts_norm = X_test_or
    elif norm_num == 1:
        scaler = preprocessing.Normalizer().fit(X_train_or)
        X_tr_norm = scaler.transform(X_train_or)
        X_ts_norm = scaler.transform(X_test_or)
    elif norm_num == 2:
        scaler = preprocessing.StandardScaler().fit(X_train_or)
        X_tr_norm = scaler.transform(X_train_or)
        X_ts_norm = scaler.transform(X_test_or)
    elif norm_num == 3:
        scaler = preprocessing.MinMaxScaler().fit(X_train_or)
        X_tr_norm = scaler.transform(X_train_or)
        X_ts_norm = scaler.transform(X_test_or)
    elif norm_num == 4:
        scaler = preprocessing.MaxAbsScaler().fit(X_train_or)
        X_tr_norm = scaler.transform(X_train_or)
        X_ts_norm = scaler.transform(X_test_or)
    elif norm_num == 5:
        scaler1 = preprocessing.StandardScaler().fit(X_train_or)
        X_tr_norm1 = scaler1.transform(X_train_or)
        X_ts_norm1 = scaler1.transform(X_test_or)
        scaler2 = preprocessing.MinMaxScaler().fit(X_tr_norm1)
        X_tr_norm = scaler2.transform(X_tr_norm1)
        X_ts_norm = scaler2.transform(X_ts_norm1)
    elif norm_num == 6:
        scaler1 = preprocessing.StandardScaler().fit(X_train_or)
        X_tr_norm1 = scaler1.transform(X_train_or)
        X_ts_norm1 = scaler1.transform(X_test_or)
        scaler2 = preprocessing.MaxAbsScaler().fit(X_tr_norm1)
        X_tr_norm = scaler2.transform(X_tr_norm1)
        X_ts_norm = scaler2.transform(X_ts_norm1)
    return X_tr_norm, X_ts_norm
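# A minimal usage sketch for norm_data (not part of the original snippet). It assumes
# `preprocessing` is imported from sklearn and that the inputs are 2-D NumPy arrays;
# norm_num=4 selects MaxAbsScaler, so each feature is divided by its training max-abs.
import numpy as np
from sklearn import preprocessing

X_train_demo = np.array([[1.0, -200.0], [2.0, 50.0], [4.0, 100.0]])
X_test_demo = np.array([[3.0, -100.0]])
X_tr, X_ts = norm_data(4, X_train_demo, X_test_demo)
print(np.abs(X_tr).max(axis=0))  # -> [1. 1.]: each column's maximum absolute value becomes 1.0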
def init_pp(ppi, raw_data):
    # Initialize list of scaler objects
    if ppi['name'] == 'MinMax':
        pp = [preprocessing.MinMaxScaler(feature_range=(-1.0, 1.0)),  # temperature
              preprocessing.MinMaxScaler(feature_range=(-1.0, 1.0))]  # humidity
    elif ppi['name'] == 'MaxAbs':
        pp = [preprocessing.MaxAbsScaler(),  # for temperature
              preprocessing.MaxAbsScaler()]  # and humidity
    elif ppi['name'] == 'StandardScaler':
        pp = [preprocessing.StandardScaler(),  # for temperature
              preprocessing.StandardScaler()]  # and humidity
    elif ppi['name'] == 'RobustScaler':
        pp = [preprocessing.RobustScaler(),  # for temperature
              preprocessing.RobustScaler()]  # and humidity
    elif ppi['name'] == 'SimpleY':
        pp = [10. / 1., 10. / 2.5]  # fixed factors for temperature and humidity
    else:
        raise ValueError('Incorrect scaler name')
    # Initialize scalers with data
    if ppi['method'] == 'individually':
        pp[0].fit(unpack(raw_data, 'T'))
        pp[1].fit(unpack(raw_data, 'q'))
    elif ppi['method'] == 'alltogether':
        pp[0].fit(np.reshape(unpack(raw_data, 'T'), (-1, 1)))
        pp[1].fit(np.reshape(unpack(raw_data, 'q'), (-1, 1)))
    elif ppi['method'] == 'qTindividually':
        if ppi['name'] != 'SimpleY':
            pp = pp[0]
            pp.fit(raw_data)
    else:
        raise ValueError('Incorrect scaler method')
    return pp
def loading_matrices(self):
    # user features
    df_rf = pd.read_csv(self.data_dir + 'rf_all_more.csv')
    # movie features
    df_cf = pd.read_csv(self.data_dir + 'cf_all_more.csv')
    print("Completed loading the data")
    print("df_rf.shape: ", df_rf.shape)
    print("df_cf.shape: ", df_cf.shape)
    print("#")
    org_u = df_rf.to_numpy()
    org_v = df_cf.to_numpy()
    print("orgU.shape: ", org_u.shape)
    print("orgV.shape: ", org_v.shape)
    print("#")
    u_scaler = preprocessing.MaxAbsScaler()
    self.U = u_scaler.fit_transform(org_u)
    v_scaler = preprocessing.MaxAbsScaler()
    self.V = v_scaler.fit_transform(org_v)
    print("U.shape: ", self.U.shape)
    print("V.shape: ", self.V.shape)
    print("#")
    print("U: ", np.min(self.U), ", ", np.max(self.U), ", ", np.median(self.U))
    print("V: ", np.min(self.V), ", ", np.max(self.V), ", ", np.median(self.V))
    print("#")
def convert_tfidf(conf):
    """
    Extract TF-IDF features.
    :param conf: configuration object with `filename`, `has_label`, `new_tfidf`,
                 `tfidf_helper` and `tfidf_file` attributes
    :return: None (the transformed matrices are pickled to conf.tfidf_file)
    """
    import jieba
    # Label strings are kept verbatim because they must match the dataset:
    # 人类作者 (human author), 自动摘要 (automatic summary),
    # 机器作者 (machine author), 机器翻译 (machine translation)
    labels = ['人类作者', '自动摘要', '机器作者', '机器翻译']
    label_dict = dict([(label, i) for i, label in enumerate(labels)])
    print('start extract tfidf')
    texts = []
    texts_cut = []
    with open(conf.filename, encoding='utf-8') as fin:
        for line in fin:
            sample = json.loads(line.strip())
            if conf.has_label:
                label = sample['标签']  # '标签' = label
            else:
                label = labels[0]
            text = sample['内容']  # '内容' = content
            if label in label_dict:
                texts.append(text)
                texts_cut.append(' '.join(jieba.cut(text)))
    gram_low = 1
    gram_high = 6
    max_feature = 10000
    if conf.new_tfidf:
        vectorizer_char = TfidfVectorizer(encoding="utf8", analyzer='char',
                                          ngram_range=(gram_low, gram_high),
                                          max_features=max_feature)
        scaler = preprocessing.MaxAbsScaler()
        vectorizer_char.fit(texts)
        x = vectorizer_char.transform(texts)
        scaler.fit(x)
        x = scaler.transform(x)
        vectorizer_term = TfidfVectorizer(encoding="utf8", analyzer='word',
                                          ngram_range=(gram_low, gram_high),
                                          max_features=max_feature)
        scaler_term = preprocessing.MaxAbsScaler()
        vectorizer_term.fit(texts_cut)
        x_term = vectorizer_term.transform(texts_cut)
        scaler_term.fit(x_term)
        x_term = scaler_term.transform(x_term)
        pickle_dump((vectorizer_char, scaler, vectorizer_term, scaler_term), conf.tfidf_helper)
    else:
        vectorizer_char, scaler, vectorizer_term, scaler_term = pickle_load(conf.tfidf_helper)
        x = vectorizer_char.transform(texts)
        x = scaler.transform(x)
        x_term = vectorizer_term.transform(texts_cut)
        x_term = scaler_term.transform(x_term)
    pickle_dump((x, x_term), conf.tfidf_file)
def normaldata(xtrold, xteold):
    Xtrain = xtrold.reshape(60000, 784).astype('float32')
    Xtest = xteold.reshape(10000, 784).astype('float32')
    # Fit on the training set only and apply the same scaler to the test set.
    scaler = preprocessing.MaxAbsScaler().fit(Xtrain)
    xtrnew = scaler.transform(Xtrain)
    xtenew = scaler.transform(Xtest)
    # Normalization:
    # xtrnew = Xtrain / 255
    # xtenew = Xtest / 255
    # scaling this way has the same effect for 8-bit pixel data
    return xtrnew, xtenew
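# A small check (not from the original code) of the comment above: for columns whose
# training maximum is 255, MaxAbsScaler is identical to dividing by 255. The toy array
# below stands in for MNIST pixels; columns that never reach 255 are scaled by their
# own maximum instead, which is where the two approaches differ.
import numpy as np
from sklearn import preprocessing

pixels = np.array([[0, 128, 255],
                   [0,  64, 128]], dtype='float32')
scaled = preprocessing.MaxAbsScaler().fit_transform(pixels)
print(scaled)            # column maxima become 1.0 (all-zero columns stay 0)
print(pixels / 255.0)    # matches only where the column maximum is 255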
def preprocessing_DX_DSIFF(self, validation_percentage, name_to_save):
    # Scaling and division between validation and training sets
    for i in range(len(self.DX)):
        if i == 0:
            DXp = self.DX[i]
            Ft = self.ftot_stru[i]
        else:
            DXp = np.concatenate((DXp, self.DX[i]), axis=0)
            Ft = np.concatenate((Ft, self.ftot_stru[i]), axis=0)
    DXx = DXp[:, :, 0]
    DXy = DXp[:, :, 1]
    DXz = DXp[:, :, 2]
    Fx = Ft[:, 0]
    Fy = Ft[:, 1]
    Fz = Ft[:, 2]
    scaler = preprocessing.MaxAbsScaler()
    DXx_scaled = scaler.fit_transform(DXx)
    filename = '%s/scaler_Fx.sav' % name_to_save
    joblib.dump(scaler, filename)
    scaler = preprocessing.MaxAbsScaler()
    DXy_scaled = scaler.fit_transform(DXy)
    filename = '%s/scaler_Fy.sav' % name_to_save
    joblib.dump(scaler, filename)
    scaler = preprocessing.MaxAbsScaler()
    DXz_scaled = scaler.fit_transform(DXz)
    filename = '%s/scaler_Fz.sav' % name_to_save
    joblib.dump(scaler, filename)
    mixer = np.array(range(DXx_scaled.shape[0]))
    for _ in range(1000):
        np.random.shuffle(mixer)
    n = int(len(mixer) * (1.0 - validation_percentage / 100.0))  # marks the training share, e.g. 90%
    self.DXx_trai = DXx_scaled[mixer[:n]]
    self.DXx_vali = DXx_scaled[mixer[n:]]
    # DXx: x component of nabla(X), shape (numb_of_atoms = numb_of_struc * numb_atoms_in_struc, numb_of_features)
    self.Fx_trai = Fx[mixer[:n]]
    self.Fx_vali = Fx[mixer[n:]]
    # Fx: x component of the force, 1-D array of length numb_of_atoms
    self.DXy_trai = DXy_scaled[mixer[:n]]
    self.DXy_vali = DXy_scaled[mixer[n:]]
    self.Fy_trai = Fy[mixer[:n]]
    self.Fy_vali = Fy[mixer[n:]]
    self.DXz_trai = DXz_scaled[mixer[:n]]
    self.DXz_vali = DXz_scaled[mixer[n:]]
    self.Fz_trai = Fz[mixer[:n]]
    self.Fz_vali = Fz[mixer[n:]]
    return None
def train(self, verbose=0, sigma=0, seed=23, transform=False):
    """
    Compiles the model, prints a summary, and fits it to the data.
    The boolean `transform` rescales the data if True (default False),
    and uses the raw data otherwise. The input `sigma` controls the noise
    added to the train/val inputs.
    """
    # load data and targets
    Phi_train, theta_Phi_train = deepcopy(self.train_data)
    Phi_val, theta_Phi_val = deepcopy(self.val_data)
    # add noise
    Phi_train, train_noise = tools.add_noise(Phi_train, sigma, seed=2)
    Phi_val, val_noise = tools.add_noise(Phi_val, sigma, seed=3)
    self.transformed = transform
    if transform:
        # transform train and val inputs
        Phi_train_tformer = preprocessing.MaxAbsScaler()
        Phi_val_tformer = preprocessing.MaxAbsScaler()
        Phi_train = Phi_train_tformer.fit_transform(Phi_train)
        Phi_val = Phi_val_tformer.fit_transform(Phi_val)
        # transform train and val targets
        theta_Phi_train_tformer = preprocessing.MaxAbsScaler()
        theta_Phi_val_tformer = preprocessing.MaxAbsScaler()
        theta_Phi_train = theta_Phi_train_tformer.fit_transform(theta_Phi_train)
        theta_Phi_val = theta_Phi_val_tformer.fit_transform(theta_Phi_val)
    # compile and print summary
    set_seed(seed)
    self.build_model()
    self.model.summary()
    # make callbacks and fit model
    callbacks = self.get_callbacks()
    self.model.fit(x=Phi_train, y=theta_Phi_train,
                   validation_data=(Phi_val, theta_Phi_val),
                   batch_size=self.batch_size,
                   epochs=self.epochs,
                   callbacks=callbacks,
                   verbose=verbose)
    print('test mse:', self.model.evaluate(self.test_data[0], self.test_data[1]))
    print('test thetas:', self.model.predict(self.test_data[0]))
def PreprocessData(self, Preprocess='AbsMax'):
    # The default scaler is MaxAbs
    assert Preprocess in ['AbsMax', 'MinMax'], \
        "%r is not a registered preprocess method" % Preprocess
    try:
        print('The preprocess method is', Preprocess)
    except NameError:
        print('Please enter the preprocess method')
    if Preprocess == 'AbsMax':
        scaler = preprocessing.MaxAbsScaler()
    if Preprocess == 'MinMax':
        scaler = preprocessing.MinMaxScaler()
    if self.IfSplitData:
        try:
            self.TRAIN_DATA_all = scaler.fit_transform(self.TRAIN_DATA_all)
            self.VAL_DATA_all = scaler.fit_transform(self.VAL_DATA_all)
        except NameError:
            print("The scaler hasn't been defined; the data hasn't been processed")
    else:
        try:
            self.DATA_all = scaler.fit_transform(self.DATA_all)
            self.LABEL_all = scaler.fit_transform(self.LABEL_all)
        except NameError:
            print("The scaler hasn't been defined; the data hasn't been processed")
    print('.\n..\n...\nPreprocess is done')
def preprocess(preprocesstype, var):
    # preprocesstype selects the preprocessing type for the model:
    # "MMS" for MinMaxScaler, "RS" for RobustScaler, "SS" for StandardScaler, "MAS" for MaxAbsScaler
    # var is the np.array the scaling is applied to
    from sklearn import preprocessing
    if preprocesstype == "MMS":
        print("preprocessing is done with MinMaxScaler")
        X = preprocessing.MinMaxScaler()
        var = X.fit_transform(var)
        return var
    elif preprocesstype == "RS":
        print("preprocessing is done with RobustScaler")
        X = preprocessing.RobustScaler()
        var = X.fit_transform(var)
        return var
    elif preprocesstype == "SS":
        print("preprocessing is done with StandardScaler")
        X = preprocessing.StandardScaler()
        var = X.fit_transform(var)
        return var
    elif preprocesstype == "MAS":
        print("preprocessing is done with MaxAbsScaler")
        X = preprocessing.MaxAbsScaler()
        var = X.fit_transform(var)
        return var
    else:
        print("Preprocessing type not recognized")
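# A brief usage sketch for preprocess (illustrative only): the 2-D array below is a
# hypothetical feature matrix, and "MAS" routes it through MaxAbsScaler so every column
# ends up inside [-1, 1] without the sparsity-destroying centering step.
import numpy as np

features = np.array([[1.0, -5.0], [2.0, 10.0], [0.5, 0.0]])
scaled = preprocess("MAS", features)
print(scaled)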
def __init__(self):
    super().__init__()
    self.name = 'Max-abs Scaler'
    self.model = preprocessing.MaxAbsScaler()
    self.takes_label = False
def test_maxAbsScaler(self):
    data = np.random.normal(10, 3, size=100)
    data = np.array([data]).T
    maxabs_scaler = preprocessing.MaxAbsScaler()
    self.scaler2dict2scaler_test(maxabs_scaler, data)
def data_precess(train_data, yinzi=[], pre_style='max_min'):
    import sklearn.preprocessing as spp
    train_data.fillna(0, inplace=True)
    # '平均月收益' = average monthly return; drop rows where it is zero
    train_data = train_data[train_data['平均月收益'] != 0].copy()
    train_data0 = pd.DataFrame()
    # Data preprocessing
    if pre_style == 'max_min':
        train_data0 = spp.MinMaxScaler().fit_transform(train_data[yinzi])
    elif pre_style == 'max_abs':
        train_data0 = spp.MaxAbsScaler().fit_transform(train_data[yinzi])
    elif pre_style == 'standar':
        train_data0 = spp.StandardScaler().fit_transform(train_data[yinzi])
    elif pre_style == 'normal':
        train_data0 = spp.Normalizer().fit_transform(train_data[yinzi])
    train_data0 = pd.DataFrame(train_data0, columns=yinzi, index=train_data.index)
    # '预测周期真实收益' = actual return over the prediction period
    train_data0.loc[:, '预测周期真实收益'] = pd.Series(train_data['预测周期真实收益'])
    return train_data0
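# A hypothetical call to data_precess (made-up values, assuming pandas is available as pd):
# the Chinese column names mirror what the function expects, and pre_style='max_abs'
# selects MaxAbsScaler for the factor columns listed in `yinzi`.
import pandas as pd

df = pd.DataFrame({
    '平均月收益': [0.02, 0.05, 0.0],        # average monthly return (the zero row is dropped)
    'factor_a': [1.0, -2.0, 3.0],
    'factor_b': [10.0, 5.0, 0.0],
    '预测周期真实收益': [0.01, 0.03, 0.02],  # actual return over the prediction period
})
out = data_precess(df, yinzi=['factor_a', 'factor_b'], pre_style='max_abs')
print(out)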
def news20(data_path=home + '/datasets/news20/news20.binary'):
    try:
        open(data_path, 'r')
    except FileNotFoundError as e:
        print(str(e))
        print("Download news20.binary from https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary.html")
        return None, None, None, None
    X, Y = datasets.load_svmlight_file(data_path)
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25, random_state=42)
    # normalization makes learning very slow (due to lack of sparsity)
    s = preprocessing.MaxAbsScaler()
    X_train = s.fit_transform(X_train)
    X_test = s.transform(X_test)
    X_train = preprocessing.normalize(X_train)
    X_test = preprocessing.normalize(X_test)
    X_train = X_train.toarray()
    X_test = X_test.toarray()
    return X_train, Y_train, X_test, Y_test
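# A small, loosely related check (not from the original loader) of why MaxAbsScaler is a
# common choice for libsvm-style data: it keeps a sparse matrix sparse, whereas mean
# centering would force a dense representation.
import scipy.sparse as sp
from sklearn import preprocessing

sparse_demo = sp.csr_matrix([[0.0, 3.0, 0.0], [1.0, 0.0, 0.0]])
scaled_sparse = preprocessing.MaxAbsScaler().fit_transform(sparse_demo)
print(type(scaled_sparse), scaled_sparse.nnz)  # still CSR, same number of non-zeros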
def normalX(trainFile):
    from sklearn.datasets import load_svmlight_file
    x_train, y_train = load_svmlight_file(trainFile)
    print(x_train.shape, y_train.shape)
    from collections import Counter
    Y_statis = sorted(Counter(y_train).items())
    print(Y_statis, len(Y_statis))
    json.dump(Y_statis, open(DATAPATH + '/Y_statis.txt', 'w+', encoding='utf-8'), ensure_ascii=False)
    from sklearn import preprocessing
    max_abs_scaler = preprocessing.MaxAbsScaler()
    x_train = max_abs_scaler.fit_transform(x_train)
    pickle.dump(max_abs_scaler, open(MODELPATH + '/MaxAbsScaler.pickle', 'wb'), -1)
    from sklearn.model_selection import train_test_split
    X_train, x_test, Y_train, y_test = train_test_split(x_train, y_train, test_size=0.2)
    from sklearn.datasets import dump_svmlight_file
    dump_svmlight_file(x_test, y_test, DATAPATH + '/Normal_valiation.libsvm')
    dump_svmlight_file(x_train, y_train, DATAPATH + '/Normal_total.libsvm')
    dump_svmlight_file(X_train, Y_train, DATAPATH + '/Normal_train.libsvm')
def get_matrix_of_concatenated_document_embeddings(embeddings, n_dim, texts,
                                                   token_limit=20, stop_words=[''],
                                                   scale=False):
    """
    :param embeddings: dict mapping a token to its embedding vector
    :param n_dim: dimensionality of a single embedding
    :param texts: list of documents
    :param token_limit: number of tokens per document to concatenate
    :param stop_words: tokens to skip
    :param scale: if True, rescale each embedding with MaxAbsScaler
    :return: matrix of shape (len(texts), token_limit * n_dim)
    """
    scaler = preprocessing.MaxAbsScaler()
    # scaler = preprocessing.MinMaxScaler()
    tokenizer = WordPunctTokenizer()
    matrix = np.zeros((len(texts), token_limit * n_dim))
    for i_texts in range(0, len(texts)):
        tokens = tokenizer.tokenize(texts[i_texts])
        tmp = []
        for i_token in range(0, token_limit):
            cur_embedding = [0] * n_dim
            # if the text still has tokens left, the current token is in the embeddings,
            # and it is not on the stop word list
            if i_token < len(tokens) and tokens[i_token] in embeddings.keys() and not tokens[i_token] in stop_words:
                tmp_embedding = scaler.fit_transform(embeddings[tokens[i_token]]) if scale else embeddings[tokens[i_token]]
                cur_embedding = tmp_embedding.tolist()
            tmp += cur_embedding
        matrix[i_texts] = np.array(tmp)
    return matrix
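# An illustrative call (with made-up embeddings) showing the shape contract: each
# document contributes token_limit * n_dim values, so two texts with token_limit=3 and
# 2-dimensional vectors yield a (2, 6) matrix. Assumes numpy, nltk and scikit-learn are installed.
import numpy as np
from nltk.tokenize import WordPunctTokenizer
from sklearn import preprocessing

toy_embeddings = {
    'red': np.array([0.1, -0.4]),
    'blue': np.array([0.3, 0.2]),
    'car': np.array([-0.5, 0.9]),
}
docs = ['red car', 'blue car goes fast']
m = get_matrix_of_concatenated_document_embeddings(toy_embeddings, n_dim=2, texts=docs, token_limit=3)
print(m.shape)  # (2, 6); missing or unknown tokens contribute zero vectors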
def cadata(data_path=home + '/datasets/cadata/cadata'):
    """Reported performance:
    http://www.jmlr.org/papers/volume18/15-025/15-025.pdf
    http://www.stat.cmu.edu/~cshalizi/350/hw/solutions/solutions-06.pdf
    """
    try:
        open(data_path, 'r')
    except FileNotFoundError as e:
        print(str(e))
        print("Download cadata from https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/regression.html")
        return None, None, None, None
    X, Y = datasets.load_svmlight_file(data_path)
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25, random_state=42)
    X_train = X_train.toarray()
    X_test = X_test.toarray()
    y_m = np.mean(Y_train)
    y_s = np.std(Y_train)
    Y_train = (Y_train - y_m)
    Y_test = (Y_test - y_m)
    s = preprocessing.MaxAbsScaler()
    X_train = s.fit_transform(X_train)
    X_test = s.transform(X_test)
    X_train = preprocessing.normalize(X_train)
    X_test = preprocessing.normalize(X_test)
    return X_train, Y_train, X_test, Y_test
def abalone(data_path=home + '/datasets/abalone/abalone'):
    try:
        open(data_path, 'r')
    except FileNotFoundError as e:
        print(str(e))
        print("Download abalone from https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/regression.html")
        return None, None, None, None
    X, Y = datasets.load_svmlight_file(data_path)
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25, random_state=42)
    X_train = X_train.toarray()
    X_test = X_test.toarray()
    y_m = np.mean(Y_train)
    y_s = np.std(Y_train)
    Y_train = (Y_train - y_m)
    Y_test = (Y_test - y_m)
    s = preprocessing.MaxAbsScaler()
    X_train = s.fit_transform(X_train)
    X_test = s.transform(X_test)
    X_train = preprocessing.normalize(X_train)
    X_test = preprocessing.normalize(X_test)
    return X_train, Y_train, X_test, Y_test
def compute_dimentionality_reduction_embedding(x, y):
    print('\nComputing dimensionality reduction embedding.\n')
    perplexity = 20
    n_components = 3
    embedding = manifold.TSNE(n_components=n_components, perplexity=perplexity,
                              init='pca', random_state=0)
    x_tSNE = embedding.fit_transform(x)
    # Normalize data.
    max_abs_scaler = preprocessing.MaxAbsScaler()
    x_tSNE_normalized = max_abs_scaler.fit_transform(x_tSNE)
    if n_components == 2:
        plot_data_2d(x_tSNE_normalized, y, markersize=2, alpha=1.0, auto_limit_enabled=False)
    elif n_components == 3:
        plot_data_3d(x_tSNE_normalized, y)
    return x_tSNE_normalized
def classifyTest(self):
    # y, x = svm_read_problem('Default+Up+Down_30')
    # means, stdevs = self.calcMeansStdevs(x)
    # m = svm_train(y[:90], x[:90], '-s 0 -t 1')
    x_train, y_train = load_svmlight_file('Default+Up+Down+Left+Right')
    scaler = preprocessing.MaxAbsScaler()
    x_scaled = scaler.fit_transform(x_train)
    clf = svm.SVC(kernel='poly')
    clf.fit(x_scaled, y_train)
    for i in range(200):
        x = [0] * np.shape(x_train)[1]
        count = 0
        for j in range(8 - 1, -1, -1):
            for k in range(j - 1, -1, -1):
                self.connManager.connectElectrodes(j, k)
                impedance = self.doFreqSweep()
                x[count] = impedance
                count += 1
        numBasicFeats = 28
        for j in range(numBasicFeats):
            for k in range(j - 1):
                diff = abs(x[j] - x[k])
                x[count] = diff
                count += 1
        x_s = scaler.transform([x])  # transform expects a 2-D array: one row per sample
        # print(x_s)
        print(clf.predict(x_s))
        # p_labs, p_acc, p_vals = svm_predict([0], [x], m)
        # print(p_labs)
def mushrooms(data_path=home + '/datasets/mushrooms/mushrooms'):
    try:
        open(data_path, 'r')
    except FileNotFoundError as e:
        print(str(e))
        print("Download mushrooms from https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/mushrooms")
        return None, None, None, None
    X, Y = datasets.load_svmlight_file(data_path)
    X = X.toarray()
    Y[Y == 2] = -1
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25, random_state=42)
    s = preprocessing.MaxAbsScaler()
    X_train = s.fit_transform(X_train)
    X_test = s.transform(X_test)
    X_train = preprocessing.normalize(X_train)
    X_test = preprocessing.normalize(X_test)
    return X_train, Y_train, X_test, Y_test
def get_fitness(genes, texts, labels):
    features = np.asarray([0 for _ in labels])
    acc = 0
    pool = mp.Pool(mp.cpu_count() - 2)
    for idx, text in enumerate(texts):
        pool.apply_async(calc_text_score, (genes, text, features, idx))
    pool.close()
    pool.join()  # wait for all per-text scores before using `features`
    # calculate accuracy
    max_abs_scaler = preprocessing.MaxAbsScaler()
    scaled_train_data = max_abs_scaler.fit_transform(features)
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=2019)
    missfits = []
    for train, test in skf.split(features, labels):
        clf = CalibratedClassifierCV(OneVsRestClassifier(SVC(C=1)))
        train_labels = [l for idx, l in enumerate(labels) if idx in train]
        test_labels = [l for idx, l in enumerate(labels) if idx in test]
        clf.fit(scaled_train_data[train], train_labels)
        predictions = clf.predict(scaled_train_data[test])
        proba = clf.predict_proba(scaled_train_data[test])
        # Reject option (used in open-set cases)
        for i, p in enumerate(predictions):
            sproba = sorted(proba[i], reverse=True)
            if sproba[0] - sproba[1] < 0.1:
                predictions[i] = u'<UNK>'
        acc += np.mean([p == t for p, t in zip(predictions, test_labels)]) / skf.get_n_splits()
        # record the original indices of the misclassified test samples
        missfits.extend([test[i] for i, p in enumerate(predictions) if p != test_labels[i]])
    return acc, missfits
def scale_x(feature_files):
    max_abs_scaler = preprocessing.MaxAbsScaler()
    for f in feature_files:
        x, y = datasets.load_svmlight_file(f)
        x_scale = np.round(max_abs_scaler.fit_transform(x), 4)
        datasets.dump_svmlight_file(x_scale, y, f)
        print(str(f) + " finished.")
def __init__(self):
    super().__init__('./feature_save/user_features_{}_{}.pkl')
    self.shop_to_index = None
    self.user_counter = None
    self.total_counter = None
    self.norm = 1
    self._scaler = preprocessing.MaxAbsScaler()
def normalize(matrix):
    """Matrix normalization

    Parameters
    ----------
    matrix : numpy matrix

    Returns
    -------
    X_scale_maxabs : numpy matrix
        rescaled matrix
    """
    # For details on this normalization, see:
    # https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MaxAbsScaler.html
    # Scale each feature by its maximum absolute value.
    # This estimator scales and translates each feature individually such that the maximal
    # absolute value of each feature in the training set will be 1.0.
    # It does not shift/center the data, and thus does not destroy any sparsity.
    X_scale = np.copy(matrix)  # the data to scale (usable for the HCP data)
    max_abs_scaler = preprocessing.MaxAbsScaler()
    X_scale_maxabs = max_abs_scaler.fit_transform(X_scale)
    return X_scale_maxabs
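# A short demonstration (not part of the original module) of the behaviour the comments
# above describe: each column is divided by its own maximum absolute value, so signs are
# preserved and zero entries stay zero.
import numpy as np
from sklearn import preprocessing

demo = np.array([[ 2.0, -1.0,  0.0],
                 [ 4.0,  0.5,  0.0],
                 [-1.0,  0.25, 3.0]])
print(preprocessing.MaxAbsScaler().fit_transform(demo))
# columns are divided by 4.0, 1.0 and 3.0 respectively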
def preprocessed_data(train_path='data/d_train_20180102.csv', test_path='data/d_test_B_20180128.csv'):
    """
    :return: the preprocessed training and test data
    """
    train, testB = raw_data(train_path, test_path)
    train_id = train.id.values.copy()
    # '血糖' = blood glucose (the regression target)
    feature_columns = [f for f in train.columns if f not in ['id', '血糖']]
    test_idB = testB.id.values.copy()
    data = pd.concat([train, testB])
    # Category mapping: '性别' = gender ('男' = male, '女' = female)
    data['性别'] = data['性别'].map({'男': 1, '女': 0})
    # Date mapping: '体检日期' = check-up date, converted to day of year
    data['体检日期'] = pd.to_datetime(data['体检日期']).apply(lambda a: a.dayofyear)
    # Missing-value handling
    data.fillna(data.median(axis=0), inplace=True)
    # Normalization
    scaler = preprocessing.MaxAbsScaler()
    data[feature_columns] = scaler.fit_transform(data[feature_columns])
    train_feat = pd.DataFrame(data[data.id.isin(train_id)])
    test_featB = pd.DataFrame(data[data.id.isin(test_idB)])
    test_featB.drop(labels=['血糖'], axis=1, inplace=True)
    return train_feat, test_featB
def bin_benchmark(train, test, test_labels, lambdav=3, normalize=True,
                  lap_vec=None, bins_size=range(2, 10), k=20):
    if normalize:
        max_abs_scaler = preprocessing.MaxAbsScaler()
        standard_scaler = preprocessing.StandardScaler(with_std=False)
        train = max_abs_scaler.fit_transform(standard_scaler.fit_transform(train))
        test = max_abs_scaler.transform(standard_scaler.transform(test))
    aurocs = np.zeros(len(bins_size))
    data_dim = train.shape[1]
    for i, b in enumerate(bins_size):
        bins = [b] * data_dim
        loop_wdbc = Loop(train, lambdav=lambdav, k=k, lap_vec=lap_vec, bins=bins)
        aurocs[i] = sklearn.metrics.roc_auc_score(test_labels, loop_wdbc.query_loop(test))
    return aurocs
def get_scaler(scale_method='StandardScaler'):
    """
    Get different kinds of scalers from scikit-learn

    :param scale_method: scale method
    :returns: scaler instance
    :raises: none
    """
    scaler = None
    if scale_method == 'StandardScaler':
        scaler = preprocessing.StandardScaler()
    elif scale_method == 'MinMaxScaler':
        scaler = preprocessing.MinMaxScaler()
    elif scale_method == 'MaxAbsScaler':
        scaler = preprocessing.MaxAbsScaler()
    elif scale_method == 'RobustScaler':
        scaler = preprocessing.RobustScaler()
    elif scale_method == 'QuantileTransformer':
        scaler = preprocessing.QuantileTransformer()
    elif scale_method == 'Normalizer':
        scaler = preprocessing.Normalizer()
    elif scale_method == 'PowerTransformer':
        scaler = preprocessing.PowerTransformer()
    else:
        print(scale_method, ' not found')
    return scaler
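# A usage sketch for get_scaler (illustrative, with made-up arrays): pick a scaler by
# name, fit it on the training split only, and reuse the fitted statistics on the test
# split so no test information leaks into the scaling.
import numpy as np
from sklearn import preprocessing

X_train_demo = np.array([[1.0, -10.0], [3.0, 5.0], [-2.0, 20.0]])
X_test_demo = np.array([[2.0, -5.0]])
scaler = get_scaler('MaxAbsScaler')
X_train_scaled = scaler.fit_transform(X_train_demo)
X_test_scaled = scaler.transform(X_test_demo)
print(X_train_scaled.max(axis=0), X_test_scaled)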
def benchmark(train, test, test_labels, ks, lambdav=3, normalize=True,
              lap_vec=None, bins=None):
    if normalize:
        max_abs_scaler = preprocessing.MaxAbsScaler()
        standard_scaler = preprocessing.StandardScaler(with_std=False)
        train = max_abs_scaler.fit_transform(standard_scaler.fit_transform(train))
        test = max_abs_scaler.transform(standard_scaler.transform(test))
    aurocs = np.zeros(len(ks))
    for i, k in enumerate(ks):
        loop_wdbc = Loop(train, lambdav=lambdav, k=k, lap_vec=lap_vec, bins=bins)
        aurocs[i] = sklearn.metrics.roc_auc_score(test_labels, loop_wdbc.query_loop(test))
    return aurocs
def demo():
    print("Loading model...")
    model = torch.load(MODEL_PATH, map_location="cpu")['model']
    scaler = preprocessing.MaxAbsScaler()
    mix = np.zeros(64000, dtype=np.float32)
    mix_r = np.zeros(64000, dtype=np.float32)
    names = []
    for i in range(4):
        data, data_r, name = collect_data(i)
        mix += data
        mix_r += data_r
        names.append(name[:-1])
    mix = scaler.fit_transform(mix.reshape(-1, 1)).T
    mix_r = scaler.fit_transform(mix_r.reshape(-1, 1)).T
    print("The mixture is stored as mix.wav")
    sf.write('./mix.wav', mix[0, :], samplerate=16000)
    mix = torch.tensor(mix).unsqueeze(0)
    mix_r = torch.tensor(mix_r).unsqueeze(0)
    print("Processing...")
    features, features_ = model(mix_r, mix)
    distances = []
    for j in range(4):
        d = (features_[j] - features[j]).pow(2).sum(1)
        distances.append(d.item())
    # features_r, features = model([mix, mix_r])
    print("----------------------")
    print("The anomaly scores for pump, slider, fan and valve are:")
    print(distances)
    print("The sources are:...")
    print(names)
def svmTestBySample(dataset, descriptor, space, channel, illuminant="IIC", testFolds=[5]):
    nameSpace, nameChannel = sc.getSpaceChannelName(space, channel)
    tt = descriptor.upper()
    # Loading test data
    fd = ""
    for i in testFolds:
        fd = fd + str(i) + "-"
    fd = fd[:-1]
    outfile = ("../training-test-files/" + tt + "-" + illuminant + "-" + nameSpace + "-" +
               nameChannel + "/" + dataset + "-SVM-test-folds-" + fd)
    ft, lb = readTrainingTestFiles(outfile)
    testMatrixF = np.array(ft)
    testMatrixL = np.array(lb)
    # Scale test features
    # testMatrixFScaled = preprocessing.scale(testMatrixF)
    # Scale features to [-1, 1]
    max_abs_scaler = preprocessing.MaxAbsScaler()
    testMatrixFScaled = max_abs_scaler.fit_transform(testMatrixF)
    npath = "../models/" + tt + "-" + illuminant + "-" + nameSpace + "-" + nameChannel + "/"
    modelName = npath + "model-" + dataset + "-" + tt + "-" + illuminant + "-" + nameSpace + "-" + nameChannel + ".pkl"
    clf = joblib.load(modelName)
    outLabels = clf.predict(testMatrixFScaled)
    scores = clf.score(testMatrixFScaled, testMatrixL)
    return (outLabels, scores)