def RBM_ensemble(predict_all):
    """RBM ensemble method.

    Trains (or loads) a BernoulliRBM over the stacked classifier
    predictions, transforms normal/anomalous partitions, and derives
    confusion-matrix counts by thresholding hidden probabilities at T1.

    Relies on module-level ENSEMBLE_TRAIN, N_COMPONENTS, N_ITERATIONS,
    LEARNING_RATE, BATCH_SIZE, T1, load and MLmodel_evaluation.
    """
    # RBM expects binary {0,1} inputs: map -1 labels to 0.
    predict_all[predict_all == -1] = 0

    # FIX: the original `if ... == True / elif ... == False` left every
    # downstream variable undefined for non-bool values; use if/else.
    if ENSEMBLE_TRAIN:
        # Train a fresh RBM on the full prediction matrix.
        rbm = BernoulliRBM(n_components=N_COMPONENTS, n_iter=N_ITERATIONS,
                           learning_rate=LEARNING_RATE, batch_size=BATCH_SIZE)
        print(f'\nStarting RBM training.... {datetime.datetime.now().time()}')
        rbm.fit(predict_all)
        print(f'\nRBM training complete.... - {datetime.datetime.now().time()}')
        #dump(rbm, 'rbm_ensemble_OLAK.joblib', compress=True)  # Save RBM model
        split = 5855   # rows [0, split) are normal, the rest anomalous
    else:
        rbm = load('rbm_ensemble_OLAK.joblib')  # Load stored RBM model
        split = 52692

    normal_test_predict_all = predict_all[0:split, :]
    anomalous_test_predict_all = predict_all[split:, :]
    print(f'Sizes - {anomalous_test_predict_all.shape}')

    anomalous_test_probability = rbm.transform(anomalous_test_predict_all)
    normal_test_probability = rbm.transform(normal_test_predict_all)
    print(f'\nRBM complete.... - {datetime.datetime.now().time()}')

    # Hidden probabilities <= T1 are counted as detected anomalies; divide
    # by N_COMPONENTS because each row contributes one value per component.
    true_positives = int(anomalous_test_probability[anomalous_test_probability <= T1].shape[0] / N_COMPONENTS)
    false_negatives = int(anomalous_test_probability[T1 < anomalous_test_probability].shape[0] / N_COMPONENTS)
    false_positives = int(normal_test_probability[normal_test_probability <= T1].shape[0] / N_COMPONENTS)
    true_negatives = int(normal_test_probability[T1 < normal_test_probability].shape[0] / N_COMPONENTS)
    MLmodel_evaluation(true_positives, false_positives, false_negatives, true_negatives)
def testRBM():
    """Smoke-test BernoulliRBM on a tiny binary dataset.

    FIX: converted Python 2 print statements to Python 3 calls,
    matching the other testRBM variant in this file.
    """
    X = np.array([[0, 0, 0], [0, 1, 1], [1, 0, 1], [1, 1, 1]])
    print(X)
    model = BernoulliRBM(n_components=2)
    model.fit(X)
    print(dir(model))
    print(model.transform(X))
    print(model.score_samples(X))
    print(model.gibbs)
def words_to_vec(df):
    """Append bag-of-words counts and RBM-compressed TF-IDF features to df."""
    print("Method: words_to_vec. Working on words to vecs....")

    # Count features over the "buzzers" column (underscore-bearing tokens only).
    count_vec = CountVectorizer(stop_words='english', max_features=50,
                                ngram_range=(1, 1), token_pattern=u'.*_.*')
    count_sparse = count_vec.fit_transform(df["buzzers"])

    # TF-IDF over descriptions, compressed to 35 latent features via an RBM.
    tfidf_vec = TfidfVectorizer(stop_words='english', max_features=500,
                                ngram_range=(2, 9))
    tfidf_sparse = tfidf_vec.fit_transform(df["description"])
    rbm = BernoulliRBM(n_components=35)
    rbm.fit(tfidf_sparse)
    rbm_features = rbm.transform(tfidf_sparse)

    count_df = pd.DataFrame(count_sparse.toarray(),
                            columns=count_vec.get_feature_names())
    rbm_cols = ['buzz_boltz_' + str(ag)
                for ag in range(1, rbm_features.shape[1] + 1)]
    rbm_df = pd.DataFrame(rbm_features, columns=rbm_cols)
    df = pd.concat([df, count_df, rbm_df], axis=1)

    # Previously-explored FeatureAgglomeration alternative, kept for reference:
    #fagg = FeatureAgglomeration(n_clusters=100)
    #fagg.fit(buzzTFid_te_sparse.toarray())
    #buzzTFid_fagg = fagg.transform(buzzTFid_te_sparse.toarray())
    #buzzCount_df = pd.DataFrame(buzzCount_te_sparse.toarray(), columns=buzzCount.get_feature_names())
    #buzzTFid_fagg_cols = ['buzz_fagg' + str(ag) for ag in range(1, buzzTFid_fagg.shape[1] + 1)]
    #buzzTFid_fagg_df = pd.DataFrame(buzzTFid_fagg, columns=buzzTFid_fagg_cols)
    #df = pd.concat([df, buzzTFid_fagg_df], axis=1)

    print("Method: words_to_vec. Returning words to vecs....")
    return df
def pretrain(self, save=True):
    """Greedy layer-wise RBM pre-training; optionally pickle the results.

    Each layer's weights, hidden biases and activations are appended to
    self.rbm_weights / self.rbm_biases / self.rbm_h_act.
    """
    layer_input = self.data
    for depth, n_hidden in enumerate(self.hidden_sizes):
        print("[DBN] Layer {} Pre-Training".format(depth + 1))
        rbm = BernoulliRBM(n_components=n_hidden,
                           n_iter=self.rbm_iters[depth],
                           learning_rate=self.rbm_learning_rate[depth],
                           verbose=True,
                           batch_size=32)
        rbm.fit(layer_input)
        self.rbm_weights.append(rbm.components_)
        self.rbm_biases.append(rbm.intercept_hidden_)
        self.rbm_h_act.append(rbm.transform(layer_input))
        # The next layer is trained on this layer's activations.
        layer_input = self.rbm_h_act[-1]

    if save:
        # Persist each artifact next to the others in self.outdir.
        for fname, payload in (("rbm_weights.p", self.rbm_weights),
                               ("rbm_biases.p", self.rbm_biases),
                               ("rbm_hidden.p", self.rbm_h_act)):
            with open(self.outdir + fname, 'wb') as f:
                pickle.dump(payload, f)
def test_nn(folder='data_270_json'):
    """Fit a BernoulliRBM on all defect features, then transform each JSON file.

    FIXES:
    - the original rebound `vec` (the fitted DictVectorizer) to the
      transformed matrix inside the loop, breaking every file after the
      first; a separate local is used now;
    - bare `except:` narrowed to ValueError (what json.load raises);
    - the collected `ready` list is now returned instead of discarded
      (backward compatible: callers ignoring the return are unaffected).
    """
    all_data = put_together(folder)
    vec = DictVectorizer()
    all_detects_vec = vec.fit_transform(all_data['defects'])
    model = BernoulliRBM()
    model.fit(all_detects_vec)
    ready = []
    for fn in os.listdir(folder):
        data = None
        fullname = os.path.join(folder, fn)
        if os.path.isfile(fullname):
            with open(fullname) as f:
                try:
                    data = json.load(f)
                except ValueError:
                    # Malformed JSON: skip this file (best-effort, as before).
                    pass
        if data:
            fe = get_features(data)
            if len(fe['defects']) > 0:
                defects_vec = vec.transform(fe['defects'])
                p = model.transform(defects_vec)
                data['vd'] = p.tolist()
                r = {
                    'vzw': data['vzw'],
                    'defects': p.tolist(),
                    'measurement': fe['measurement'],
                }
                ready.append(r)
    return ready
def rbm_001():
    """Fit a BernoulliRBM on cropped/scaled training images, then
    cross-validate a Ridge-RF estimator on the RBM features."""
    s, crop = 15, 150
    n_patches, rf_size = 400000, 5

    crop_scale = CropScaleImageTransformer(
        training=True,
        result_path='data/data_train_crop_{}_scale_{}.npy'.format(crop, s),
        crop_size=crop,
        scaled_size=s,
        n_jobs=-1,
        memmap=True)
    patch_extractor = models.KMeansFeatures.PatchSampler(
        n_patches=n_patches, patch_size=rf_size, n_jobs=-1)

    images = crop_scale.transform()
    images = images.reshape((images.shape[0], 15 * 15 * 3))

    # RBM inputs must lie in [0, 1].
    images = MinMaxScaler().fit_transform(images)

    # Training is slow (~80 s/iteration reported) even at 256 components.
    rbm = BernoulliRBM(verbose=1)
    rbm.fit(images)

    train_x = rbm.transform(images)
    train_y = classes.train_solutions.data

    # 0.138 CV on 50% of the dataset
    wrapper = ModelWrapper(models.Ridge.RidgeRFEstimator,
                           {'alpha': 500, 'n_estimators': 500},
                           n_jobs=-1)
    wrapper.cross_validation(train_x, train_y, sample=0.5,
                             parallel_estimator=True)
def test_transform():
    """transform() must return the mean hidden activation (_mean_hiddens)."""
    X = Xdigits[:100]
    model = BernoulliRBM(n_components=16, batch_size=5,
                         n_iter=5, random_state=42)
    model.fit(X)
    assert_array_equal(model.transform(X), model._mean_hiddens(X))
def testRBM():
    """Exercise BernoulliRBM basics on a toy 4x3 binary dataset."""
    data = np.array([[0, 0, 0],
                     [0, 1, 1],
                     [1, 0, 1],
                     [1, 1, 1]])
    print(data)
    rbm = BernoulliRBM(n_components=2)
    rbm.fit(data)
    # Dump the fitted model's surface: attributes, latent features,
    # per-sample pseudo-likelihood, and the gibbs sampling method.
    print(dir(rbm))
    print(rbm.transform(data))
    print(rbm.score_samples(data))
    print(rbm.gibbs)
class DeepRbmMnistClassifier:
    """Three-layer deep RBM classifier for MNIST-style digit data.

    Two stacked RBMs learn features; a third RBM is trained on those
    features concatenated with a 10-slot one-hot label block.
    Classification runs the third RBM forward and then "inverted" to
    recover activity in the label slots.
    """

    def __init__(self):
        # Layer sizes and per-layer training hyper-parameters.
        self.n_components_first = 500
        self.n_components_second = 500
        self.n_components_third = 2000
        self.n_iter_first = 20
        self.n_iter_second = 20
        self.n_iter_third = 20
        self.learning_rate_first = 0.06
        self.learning_rate_second = 0.06
        self.learning_rate_third = 0.06
        self.verbose = True

    def label_to_feature(self, y):
        """Return a 10-element one-hot list for digit label y (0-9)."""
        feature = [0] * 10
        feature[y] = 1
        return feature

    def fit(self, X, y):
        """Train the two-RBM pipeline, then the label-aware third RBM."""
        self.rbm_1 = BernoulliRBM(verbose=self.verbose,
                                  n_components=self.n_components_first,
                                  n_iter=self.n_iter_first,
                                  learning_rate=self.learning_rate_first)
        self.rbm_2 = BernoulliRBM(verbose=self.verbose,
                                  n_components=self.n_components_second,
                                  n_iter=self.n_iter_second,
                                  learning_rate=self.learning_rate_second)
        self.first_pipeline = Pipeline(
            steps=[('rbm_1', self.rbm_1), ('rbm_2', self.rbm_2)])
        self.first_pipeline.fit(X, y)

        # Per-example transform is slow; classify() batches the whole
        # matrix instead.
        # TODO improve. Look at how it is done in classify
        new_features = []
        for example, label in zip(X, y):
            transformed = self.first_pipeline.transform(example)[0]
            # Append the one-hot label so rbm_3 learns features + labels jointly.
            new_features.append(
                np.concatenate((transformed, self.label_to_feature(label))))

        self.rbm_3 = BernoulliRBM(verbose=self.verbose,
                                  n_components=self.n_components_third,
                                  n_iter=self.n_iter_third,
                                  learning_rate=self.learning_rate_third)
        self.rbm_3.fit(new_features, y)

    def classify(self, X):
        """Predict labels: zero-pad the label slots, run rbm_3 forward,
        then through its inverse, and argmax over the recovered slots."""
        transformed = self.first_pipeline.transform(X)
        transformed = np.concatenate(
            (transformed, [[0] * 10] * len(transformed)), axis=1)

        # The inverse of rbm_3 to go from hidden layer to visible layer:
        # swap the bias vectors and transpose the weight matrix.
        rbm_aux = BernoulliRBM()
        rbm_aux.intercept_hidden_ = self.rbm_3.intercept_visible_
        rbm_aux.intercept_visible_ = self.rbm_3.intercept_hidden_
        rbm_aux.components_ = np.transpose(self.rbm_3.components_)

        results = rbm_aux.transform(self.rbm_3.transform(transformed))
        # The last 10 visible units are the label slots.
        results = results[:, -10:]
        return np.argmax(results, axis=1)
class DeepRbmMnistClassifier:
    """Deep RBM digit classifier: a two-RBM feature pipeline plus a third
    RBM trained on features concatenated with a one-hot label block.

    classify() reverses the third RBM (swapped biases, transposed
    weights) to read predicted activity out of the label slots.
    """

    def __init__(self):
        # Hyper-parameters for the three RBM layers.
        self.n_components_first = 500
        self.n_components_second = 500
        self.n_components_third = 2000
        self.n_iter_first = 20
        self.n_iter_second = 20
        self.n_iter_third = 20
        self.learning_rate_first = 0.06
        self.learning_rate_second = 0.06
        self.learning_rate_third = 0.06
        self.verbose = True

    def label_to_feature(self, y):
        """One-hot encode digit label y into a 10-element list."""
        feature = [0]*10
        feature[y] = 1
        return feature

    def fit(self, X, y):
        """Fit rbm_1 -> rbm_2 as a pipeline, then rbm_3 on
        (pipeline features ++ one-hot label) per example."""
        self.rbm_1 = BernoulliRBM(verbose=self.verbose,
                                  n_components=self.n_components_first,
                                  n_iter=self.n_iter_first,
                                  learning_rate=self.learning_rate_first)
        self.rbm_2 = BernoulliRBM(verbose=self.verbose,
                                  n_components=self.n_components_second,
                                  n_iter=self.n_iter_second,
                                  learning_rate=self.learning_rate_second)
        self.first_pipeline = Pipeline(steps=[('rbm_1', self.rbm_1),
                                              ('rbm_2', self.rbm_2)])
        self.first_pipeline.fit(X, y)

        # TODO improve. Look at how it is done in classify
        # (classify transforms the whole batch at once; this transforms
        # one example at a time).
        new_features = []
        for example, label in zip(X, y):
            transformed = self.first_pipeline.transform(example)[0]
            new_features.append(
                np.concatenate((transformed, self.label_to_feature(label))))

        self.rbm_3 = BernoulliRBM(verbose=self.verbose,
                                  n_components=self.n_components_third,
                                  n_iter=self.n_iter_third,
                                  learning_rate=self.learning_rate_third)
        self.rbm_3.fit(new_features, y)

    def classify(self, X):
        """Predict digit labels for the rows of X."""
        transformed = self.first_pipeline.transform(X)
        # Pad ten zeroed label slots onto every sample.
        transformed = np.concatenate(
            (transformed, [[0]*10]*len(transformed)), axis=1)

        # The inverse of rbm_3 to go from hidden layer to visible layer.
        rbm_aux = BernoulliRBM()
        rbm_aux.intercept_hidden_ = self.rbm_3.intercept_visible_
        rbm_aux.intercept_visible_ = self.rbm_3.intercept_hidden_
        rbm_aux.components_ = np.transpose(self.rbm_3.components_)

        results = rbm_aux.transform(self.rbm_3.transform(transformed))
        results = results[:, -10:]  # recovered label slots
        return np.argmax(results, axis=1)
class BernoulliRBMSearchEngine(SmartSearchEngine):
    """Search engine matching queries to registered services via
    RBM-projected TF-IDF vectors and a nearest-neighbour index.

    NOTE(review): the original header comment said "ball-tree", but the
    index is built with algorithm='brute'.
    """

    def __init__(self):
        super(BernoulliRBMSearchEngine, self).__init__()
        self._service_array = []          # services, aligned with index rows
        self._bernoulliRBM_index = None   # fitted NearestNeighbors index
        self._tfidf_matrix = None         # raw TF-IDF document matrix

    def load_configuration(self, configuration_file):
        """Load the base configuration and build the BM25-capable vectorizer."""
        super(BernoulliRBMSearchEngine, self).load_configuration(configuration_file)
        # use_bm25idf/bm25_tf/k are extensions of the project's custom
        # TfidfVectorizer — not stock sklearn parameters.
        self._vectorizer = TfidfVectorizer(
            sublinear_tf=False,
            analyzer='word',
            lowercase=False,
            use_bm25idf=self._use_bm25idf,
            bm25_tf=self._use_bm25tf,
            k=self._bm25_k,
            preprocessor=StringPreprocessorAdapter())

    def unpublish(self, service):
        # Not supported by this registry implementation.
        pass

    def _preprocess(self, bag_of_words):
        """Flatten a bag-of-words object into its string form."""
        return bag_of_words.get_words_str()

    def _after_publish(self, documents):
        """Vectorize all documents, fit the RBM and the neighbour index."""
        self._tfidf_matrix = self._vectorizer.fit_transform(documents)
        self._bernoulliRBM = BernoulliRBM(learning_rate=1)
        self._rbm_matrix = self._bernoulliRBM.fit_transform(self._tfidf_matrix)
        # n_neighbors equals the registry size, so find() ranks every service.
        self._bernoulliRBM_index = NearestNeighbors(len(self._service_array),
                                                    algorithm='brute',
                                                    metric='euclidean')
        self._bernoulliRBM_index.fit(self._rbm_matrix)

    def publish(self, service):
        # Publication handled in bulk via _after_publish.
        pass

    def find(self, query):
        """Return all registered services ranked by euclidean distance to
        the RBM-transformed query vector."""
        query = StringTransformer().transform(query)
        query_array = self._vectorizer.transform(
            [self._query_transformer.transform(query).get_words_str()])
        query_array = self._bernoulliRBM.transform(query_array.toarray())
        result = self._bernoulliRBM_index.kneighbors(query_array,
                                                     return_distance=False)[0]
        result_list = []
        for index in result:
            result_list.append(self._service_array[index])
        return result_list

    def number_of_services(self):
        pass
def scale(params):
    """Fit a Bernoulli RBM on the training split and transform both splits.

    FIX: converted the Python 2 print statement to a call (works on
    both Python 2 and 3 for a single argument).

    params: [xTrain, yTrain, xValid]
    Returns [xTrain, yTrain, xValid] with the feature matrices replaced
    by RBM activations. Also rebinds the module-level `neural` model.
    """
    xTrain = params[0]
    yTrain = params[1]
    xValid = params[2]
    print('neuralizing')
    global neural
    neural = Bernoulli()
    xTrain = neural.fit_transform(xTrain, yTrain)
    xValid = neural.transform(xValid)
    return [xTrain, yTrain, xValid]
def classify(self, X):
    """Predict digit labels by pushing X through the stacked RBMs.

    Reads self.first_pipeline and the fitted self.rbm_3; returns an
    array of argmax label indices (0-9), one per row of X.
    """
    transformed = self.first_pipeline.transform(X)
    # Append ten zeroed label slots to every sample.
    transformed = np.concatenate((transformed, [[0]*10]*len(transformed)), axis=1)
    # The inverse of rbm_3 to go from hidden layer to visible layer:
    # swap the bias vectors and transpose the weight matrix.
    rbm_aux = BernoulliRBM()
    rbm_aux.intercept_hidden_ = self.rbm_3.intercept_visible_
    rbm_aux.intercept_visible_ = self.rbm_3.intercept_hidden_
    rbm_aux.components_ = np.transpose(self.rbm_3.components_)
    results = rbm_aux.transform(self.rbm_3.transform(transformed))
    # The last 10 visible units are the label slots.
    results = results[:, -10:]
    return np.argmax(results, axis=1)
def classify(self, X):
    """Predict digit labels by running X through the stacked RBMs and
    reading activity out of the appended label slots."""
    hidden = self.first_pipeline.transform(X)
    # Pad ten zeroed label slots onto every sample.
    padding = [[0] * 10] * len(hidden)
    hidden = np.concatenate((hidden, padding), axis=1)

    # Build the inverse of rbm_3: swap the bias vectors and transpose
    # the weights so hidden activations map back to the visible layer.
    inverse = BernoulliRBM()
    inverse.intercept_hidden_ = self.rbm_3.intercept_visible_
    inverse.intercept_visible_ = self.rbm_3.intercept_hidden_
    inverse.components_ = np.transpose(self.rbm_3.components_)

    reconstructed = inverse.transform(self.rbm_3.transform(hidden))
    label_scores = reconstructed[:, -10:]  # the label slots
    return np.argmax(label_scores, axis=1)
class BernoulliRBMImpl:
    """Thin adapter that delegates fit/transform to a wrapped Op model."""

    def __init__(self, **hyperparams):
        self._hyperparams = hyperparams
        self._wrapped_model = Op(**self._hyperparams)

    def fit(self, X, y=None):
        """Fit the wrapped model; y is forwarded only when supplied."""
        if y is None:
            self._wrapped_model.fit(X)
        else:
            self._wrapped_model.fit(X, y)
        return self

    def transform(self, X):
        """Delegate transformation to the wrapped model."""
        return self._wrapped_model.transform(X)
def temp(features):
    """Train an RBM on the even-indexed rows and plot the transformed
    activations in three colour groups of ten rows each."""
    [featuresNorm, MAX, MIN] = normalizeFeatures(features)
    [X, Y] = listOfFeatures2Matrix(featuresNorm)
    rbm = BernoulliRBM(n_components=10, n_iter=1000,
                       learning_rate=0.01, verbose=False)
    # Even rows train the model; odd rows are held out (unused here).
    X1, X2 = X[0::2], X[1::2]
    Y1, Y2 = Y[0::2], Y[1::2]
    rbm.fit(X1, Y1)
    YY = rbm.transform(X1)
    # Rows 0-9 red, 10-19 green, 20-29 blue.
    for colour, offset in (('r', 0), ('g', 10), ('b', 20)):
        for i in range(10):
            plt.plot(YY[i + offset, :], colour)
    plt.show()
def _RBM(self, X, y):
    """Reduce X to self.k_features dimensions with a BernoulliRBM.

    The fitted model is stored on self.feature_reduction_method so the
    same projection can be reused later. Applied after sampling because
    the reducer is meant to be universal, not dataset-specific.
    """
    from sklearn.neural_network import BernoulliRBM
    rbm = BernoulliRBM(n_components=self.k_features)
    rbm.fit(X, y)
    self.feature_reduction_method = rbm
    return rbm.transform(X)
def pretraining(self):
    """Layer-wise RBM pre-training; stores transposed weights and hidden
    biases for later DBN initialisation."""
    layer_data = self.x_train
    for depth, units in enumerate(self.hidden_layer):
        print("DBN Layer {0} Pre-training".format(depth + 1))
        # NOTE(review): random_state is fed self.verbose_rbm, which looks
        # like a copy-paste slip — confirm before changing behaviour.
        rbm = BernoulliRBM(n_components=units,
                           learning_rate=self.learning_rate_rbm,
                           batch_size=self.batch_size_rbm,
                           n_iter=self.n_epochs_rbm,
                           verbose=self.verbose_rbm,
                           random_state=self.verbose_rbm)
        rbm.fit(layer_data)
        # components_ is (hidden, visible); transpose to (visible, hidden).
        self.weight_rbm.append(rbm.components_.T)
        self.bias_rbm.append(rbm.intercept_hidden_)
        layer_data = rbm.transform(layer_data)
    print('Pre-training finish.')
def train_ca_cd(type, X_train, y_train, X_test, y_test):
    """Pre-train a DBN with stacked RBMs, then fine-tune a Keras MLP.

    NOTE(review): the parameter `type` shadows the builtin and is only
    used to build the saved model's filename.

    Returns the fine-tuned Keras model (also saved to ../model/).
    """
    input_layer = X_train
    hidden_layer = [250, 500, 200]
    weight_rbm = []
    bias_rbm = []
    for i in range(len(hidden_layer)):
        print("DBN Layer {0} Pre-training".format(i + 1))
        rbm = BernoulliRBM(n_components=hidden_layer[i],
                           learning_rate=0.0005,
                           batch_size=512,
                           n_iter=200,
                           verbose=2,
                           random_state=1)
        rbm.fit(input_layer)
        # size of weight matrix is [input_layer, hidden_layer]
        weight_rbm.append(rbm.components_.T)
        bias_rbm.append(rbm.intercept_hidden_)
        # Next layer trains on this layer's activations.
        input_layer = rbm.transform(input_layer)
    print('Pre-training finish.', np.shape(weight_rbm[0]), np.shape(bias_rbm[0]))

    test_rms = 0
    result = []
    model = Sequential()
    print('Fine-tuning start.')
    for i in range(0, len(hidden_layer)):
        print('i:', i)
        if i == 0:
            model.add(
                Dense(hidden_layer[i], activation='sigmoid',
                      input_dim=np.shape(X_train)[1]))
        elif i >= 1:
            model.add(Dense(hidden_layer[i], activation='sigmoid'))
        else:
            pass
        # Seed each Dense layer with the corresponding RBM parameters.
        layer = model.layers[i]
        layer.set_weights([weight_rbm[i], bias_rbm[i]])
    # model.add(Dense(np.shape(yTrain)[1], activation='linear'))
    model.add(
        Dense(1, activation='linear',
              kernel_regularizer=regularizers.l2(0.01)))
    # sgd = SGD(lr=0.005, decay=0)
    model.compile(loss='mse', optimizer="rmsprop")  # sgd
    model.fit(X_train, y_train, batch_size=150, epochs=100, verbose=5)
    model.save('../model/dwt_dbn_' + type + '_100.h5')
    print('Fine-tuning finish.')
    return model
def trainRBM_SVM(features, Cparam, nComponents):
    """Fit an RBM feature extractor followed by a linear SVM.

    Returns a dict {"rbm": fitted RBM, "svm": fitted SVC}.
    """
    [X, Y] = listOfFeatures2Matrix(features)
    rbm = BernoulliRBM(n_components=nComponents, n_iter=30,
                       learning_rate=0.2, verbose=True)
    rbm.fit(X, Y)
    newX = rbm.transform(X)
    # colors = ["r","g","b"]
    # for i in range(1,Y.shape[0],5):
    #     plt.plot(newX[i,:], colors[int(Y[i])])
    # plt.show()
    svm = sklearn.svm.SVC(C=Cparam, kernel='linear', probability=True)
    svm.fit(newX, Y)
    return {"rbm": rbm, "svm": svm}
def RBM_new(): ''' RBM to evaluate on artificially generated data ''' #Define datasize (val_length, test_length) = (11710, 105384) (train_half_length, test_half_length) = (int(val_length/2), int(test_length/2)) #Generate artificial Z print('\n##############VALIDATION DATA#################') (TL_val, Z_val) = Z_new_generator(val_length, classifier_stats_val) print('\n##############TEST DATA#################') (TL_test, Z_test) = Z_new_generator(test_length, classifier_stats_test) #Convert '-1's to '0' Z_val[Z_val==-1]=0 Z_test[Z_test==-1]=0 #Train RBM rbm = BernoulliRBM(n_components = N_COMPONENTS, n_iter=N_ITERATIONS, learning_rate=LEARNING_RATE, batch_size=BATCH_SIZE) print(f'\nStarting RBM training.... {datetime.datetime.now().time()}') Z_val_probability = rbm.fit_transform(Z_val) Z_test_probability = rbm.transform(Z_test) print(f'\nRBM complete.... - {datetime.datetime.now().time()}') #Convert probability to values Z_val_final = np.sign(Z_val_probability - T1) Z_test_final = np.sign(Z_test_probability - T1) #RBM on validation data print(f'\n\n****VALIDATION RBM RESULTS*****') true_positives = sum(Z_val_final[train_half_length:]==-1) false_negatives = sum(Z_val_final[train_half_length:]==1) false_positives = sum(Z_val_final[0:train_half_length]==-1) true_negatives = sum(Z_val_final[0:train_half_length]==1) MLmodel_evaluation(true_positives, false_positives, false_negatives, true_negatives) #RBM on test data print(f'\n\n****TEST RBM RESULTS*****') true_positives = sum(Z_test_final[test_half_length:]==-1) false_negatives = sum(Z_test_final[test_half_length:]==1) false_positives = sum(Z_test_final[0:test_half_length]==-1) true_negatives = sum(Z_test_final[0:test_half_length]==1) MLmodel_evaluation(true_positives, false_positives, false_negatives, true_negatives)
def RBM():
    """Fit a BernoulliRBM on the first three columns of smaller.dta.

    FIXES:
    - the original opened the file and never closed it; np.loadtxt
      accepts a filename and manages the handle itself;
    - hyper-parameters are passed to the constructor instead of being
      poked onto the instance afterwards (same effective values).
    """
    filename = "../data/smaller.dta"
    data = np.loadtxt(filename, delimiter=" ")
    X = data[:, :3]
    Y = data[:, 3]
    print(X)
    print(Y)
    print("training on RBM")
    rbm = BernoulliRBM(random_state=0, verbose=True,
                       learning_rate=0.06, n_iter=20, n_components=100)
    rbm.fit(X, Y)  # y is ignored by BernoulliRBM.fit
    predictions = rbm.transform(X)
    params = rbm.get_params()
    print("predictions = ", predictions)
    print("rbm = ", rbm)
    print("params = ", params)
def add_Brbm(Visible, components, rs, learning_rate, verbose=None, n_iter=None):
    """Fit one BernoulliRBM layer on Visible and return its parameters.

    FIX: the original accepted `verbose` and `n_iter` but ignored them,
    hard-coding verbose=False and n_iter=50. They are now honoured; the
    old hard-coded values remain the defaults (None keeps old behaviour).

    Returns {'coefs': (visible, hidden) weights, 'bias': hidden biases,
             'hidden': hidden activations of Visible}.
    """
    rbm = BernoulliRBM(n_components=components,
                       random_state=rs,
                       learning_rate=learning_rate,
                       verbose=False if verbose is None else verbose,
                       n_iter=50 if n_iter is None else n_iter)
    rbm.fit(Visible)
    return {
        'coefs': np.transpose(np.array(rbm.components_)),
        'bias': np.array(rbm.intercept_hidden_),
        'hidden': rbm.transform(Visible),
    }
def pretrain(self, save=True):
    """Greedy layer-wise RBM pre-training over self.train_x.

    Appends each layer's weights, hidden biases and activations to
    self.rbm_weights / self.rbm_biases / self.rbm_h_act. (`save` is
    accepted for interface compatibility but unused here.)
    """
    layer_input = self.train_x  # training set
    for depth, n_hidden in enumerate(self.hidden_sizes):
        print("[DBN] Layer {} Pre-Training".format(depth + 1))
        rbm = BernoulliRBM(n_components=n_hidden,
                           n_iter=self.rbm_iters,
                           learning_rate=self.rbm_learning_rate,
                           random_state=16,
                           verbose=0,
                           batch_size=2048)
        rbm.fit(layer_input)                          # train this layer
        self.rbm_weights.append(rbm.components_)       # weight matrix
        self.rbm_biases.append(rbm.intercept_hidden_)  # hidden biases
        self.rbm_h_act.append(rbm.transform(layer_input))
        layer_input = self.rbm_h_act[-1]               # feed next layer
def rbm_001():
    """RBM features from cropped/scaled images, cross-validated with a
    Ridge-RF wrapper on half the training data."""
    scale_size = 15
    crop_size = 150
    patch_count = 400000
    receptive_field = 5

    cropper = CropScaleImageTransformer(
        training=True,
        result_path='data/data_train_crop_{}_scale_{}.npy'.format(crop_size, scale_size),
        crop_size=crop_size,
        scaled_size=scale_size,
        n_jobs=-1,
        memmap=True)
    patch_extractor = models.KMeansFeatures.PatchSampler(
        n_patches=patch_count,
        patch_size=receptive_field,
        n_jobs=-1)

    images = cropper.transform()
    images = images.reshape((images.shape[0], 15 * 15 * 3))

    # rbm needs inputs to be between 0 and 1
    scaler = MinMaxScaler()
    images = scaler.fit_transform(images)

    # Training takes a long time (~80 s per iteration reported), and that
    # was already with only 256 components.
    rbm = BernoulliRBM(verbose=1)
    rbm.fit(images)
    train_x = rbm.transform(images)
    train_y = classes.train_solutions.data

    # 0.138 CV on 50% of the dataset
    wrapper = ModelWrapper(models.Ridge.RidgeRFEstimator,
                           {'alpha': 500, 'n_estimators': 500},
                           n_jobs=-1)
    wrapper.cross_validation(train_x, train_y,
                             sample=0.5, parallel_estimator=True)
def pretrain(self):
    """Stack RBMs layer-by-layer, caching transposed weights and hidden
    biases on self for later network initialisation."""
    self.weight_rbm = []
    self.bias_rbm = []
    x_train = self.x_train
    y_train = self.y_train
    hidden_layer_structure = self.get_hidden_layer_structure()

    input_layer = x_train
    for n_units in hidden_layer_structure:
        rbm = BernoulliRBM(n_components=n_units,
                           learning_rate=self.learning_rate_rbm,
                           batch_size=self.batch_size_rbm,
                           n_iter=self.n_epochs_rbm,
                           verbose=1,
                           random_state=self.random_seed)
        rbm.fit(input_layer)
        # components_ is (hidden, visible); store as (visible, hidden).
        self.weight_rbm.append(rbm.components_.T)
        self.bias_rbm.append(rbm.intercept_hidden_)
        input_layer = rbm.transform(input_layer)
    return
def pretrain(self):
    """Greedy layer-wise RBM pre-training over self.data, with verbose
    shape tracing between layers."""
    layer_input = self.data
    print(self.data)
    for depth, n_hidden in enumerate(self.hidden_sizes):
        print(layer_input.shape)
        print("[DBN] Layer {} Pre-Training".format(depth + 1))
        rbm = BernoulliRBM(n_components=n_hidden,
                           n_iter=self.rbm_iters,
                           learning_rate=self.rbm_learning_rate,
                           verbose=2,
                           batch_size=64)
        rbm.fit(layer_input)
        self.rbm_weights.append(rbm.components_)
        self.rbm_biases.append(rbm.intercept_hidden_)
        self.rbm_h_act.append(rbm.transform(layer_input))
        # This layer's activations feed the next layer.
        layer_input = self.rbm_h_act[-1]
    print(layer_input.shape)
    print(layer_input)
#bigMatrixTrain = (bigMatrixTrain - np.min(bigMatrixTrain, 0)) / (np.max(bigMatrixTrain, 0) + 0.0001) # 0-1 scaling #Divide dataset for cross validation purposes X_train, X_test, y_train, y_test = cross_validation.train_test_split( bigMatrixTrain, y, test_size = 0.4, random_state = 0) #fix this # specify parameters and distributions to sample from # Models we will use rbm = BernoulliRBM(random_state=0, verbose=True) #classifier = Pipeline(steps=[('rbm', rbm), ('logistic', logistic)]) rbm.learning_rate = 0.04 rbm.n_iter = 30 # More components tend to give better prediction performance, but larger fitting time rbm.n_components = 300 X_train = rbm.fit_transform(X_train) X_test = rbm.transform(X_test) # Train a logistic model print("Fitting the classifier to the training set") logisticModel = linear_model.LogisticRegression() t0 = time() param_grid = {'C': [10, 30, 100, 300, 1000]} logisticModel = GridSearchCV(logisticModel, param_grid = param_grid) logisticModel = logisticModel.fit(X_train, y_train) print("done in %0.3fs" % (time() - t0)) print("Best estimator found by grid search:") print(logisticModel.best_estimator_) #logistic.C = 6000.0 # Train a SVM classification model
# Split the combined matrix back into train/test parts (Python 2 script).
x = x_all[:length_train]
t = x_all[length_train:]
label = np.array(label)
length_test = len(test)
n = label.shape[1]  # number of target columns to regress
print "x shape", x.shape
print "t shape", t.shape
print "rbm"
# Project both splits through a 2000-unit RBM.
rbm = BernoulliRBM(n_components=2000, n_iter=20, batch_size=66)
rbm.fit(x)
x = rbm.transform(x)
t = rbm.transform(t)
print "rbm x shape", x.shape
print "rbm t shape", t.shape
# Build the result matrix (one ridge regression per target column).
answer = []
print "开始回归"
for i in range(n):
    print "第%s个" % (i)
    clf = linear_model.Ridge(alpha=2, fit_intercept=True,
                             normalize=True, tol=1e-9)
# ====== GMM: per-component weighted log-probabilities as features ====== #
gmm = GaussianMixture(n_components=NUM_DIM, max_iter=100,
                      covariance_type='full', random_state=SEED)
gmm.fit(X_train)
# NOTE(review): _estimate_weighted_log_prob is a private sklearn API and
# may change or disappear between versions.
X_train_gmm = gmm._estimate_weighted_log_prob(X_train)
X_score_gmm = gmm._estimate_weighted_log_prob(X_score)
# ====== rbm ====== #
rbm = BernoulliRBM(n_components=NUM_DIM, batch_size=8,
                   learning_rate=0.0008, n_iter=8,
                   verbose=2, random_state=SEED)
rbm.fit(X_train)
X_train_rbm = rbm.transform(X_train)
X_score_rbm = rbm.transform(X_score)
# ===========================================================================
# Deep Learning
# ===========================================================================
# ===========================================================================
# Visualize
# ===========================================================================
def plot(train, score, title, applying_pca=False):
    # Presumably visualizes train vs score; the body continues past this
    # chunk. Optionally PCA-projects both inputs first.
    if applying_pca:
        pca = PCA(n_components=NUM_DIM)
        pca.fit(train)
        train = pca.transform(train)
readCsvData() #print Train_X[0] #print final Test_X = Train_X[:15] Test_Y = Train_X[15:] print len(Test_X) #print Train_X #Remove all stopwords since all characters are taken #vectorizer=Tfidfvectorizer(stop_words=None) #Train_X=vectorizer.fit_transform(documents) X = np.array(Test_X) #print type(X) #print Train_X """num_Of_clusters=3 model= KMeans(n_clusters=num_Of_clusters,init='random',max_iter=1000,n_init=2) model.fit_transform(X) labels=model.labels_ order_centroids=model.cluster_centers_.argsort()[:, ::-1]""" """model=GMM(n_components=2) model.fit(X)""" model = BernoulliRBM(n_components=2) model.fit(X) #Predict the test label for new data. testLabels = model.transform(Test_Y) print testLabels
# Free the raw rows now that sentences/information are built (Python 2).
del rows
# 3-fold stratified CV over a TF-IDF -> RBM -> random-forest pipeline.
folds = StratifiedKFold(information, n_folds=3)
result = []
for train, test in folds:
    data_train = sentences[train]
    result_train = information[train]
    data_test = sentences[test]
    result_test = information[test]
    vectorizer = TfidfVectorizer(binary=True, norm=False, use_idf=False)
    rbm = BernoulliRBM()
    classifier = RandomForestClassifier()
    # Fit the vectorizer and RBM on the training fold only.
    data_train = vectorizer.fit_transform(data_train)
    data_test = vectorizer.transform(data_test)
    data_train = rbm.fit_transform(data_train)
    data_test = rbm.transform(data_test)
    classifier.fit(data_train, result_train)
    print classificationError(classifier.predict(data_test), result_test)
    result.append(classifier.score(data_test, result_test))
# Mean accuracy across folds.
print reduce(lambda x, y: x + y, result) / float(len(result))
#'MVDREQLVQKARLAEQAERYDDMAAAMKNVTELNEPLSNEERNLLSVAYKNVVGARRSSWRVISSIEQKTSADGNEKKIEMVRAYREKIEKELEAVCQDVLSLLDNYLIKNCSETQYESKVFYLKMKGDYYRYLAEVATGEKRATVVESSEKAYSEAHEISKEHMQPTHPIRLGLALNYSVFYYEIQNAPEQACHLAKTAFDDAIAELDTLNEDSYKDSTLIMQLLRDNLTLWTSDQQDD', #'MAVMAPRTLVLLLSGALALTQTWAGSHSMRYFFTSVSRPGRGEPRFIAVGYVDDTQFVRFDSDAASQRMEPRAPWIEQEGPEYWDGETRKVKAHSQTHRVDLGTLRGYYNQSEAGSHTVQRMYGCDVGSDWRFLRGYHQYAYDGKDYIALKEDLRSWTAADMAAQTTKHKWEAAHVAEQLRAYLEGTCVEWLRRYLENGKETLQRTDAPKTHMTHHAVSDHEATLRCWALSFYPAEITLTWQRDGEDQTQDTELVETRPAGDGTFQKWAAVVVPSGQEQRYTCHVQHEGLPKPLTLRWEPSSQPTIPIVGIIAGLVLFGAVITGAVVAAVMWRRKSSDRKGGSYSQAASSDSAQGSDVSL', #'MTMDKSELVQKAKLAEQAERYDDMAAAMKAVTEQGHELSNEERNLLSVAYKNVVGARRSSWRVISSIEQKTERNEKKQQMGKEYREKIEAELQDICNDVLELLDKYLIPNATQPESKVFYLKMKGDYFRYLSEVASGDNKQTTVSNSQQAYQEAFEISKKEMQPTHPIRLGLALNFSVFYYEILNSPEKACSLAKTAFDEAIAELDTLNEESYKDSTLIMQLLRDNLTLWTSENQGDEGD', #] comblength = 7 X = map(lambda s : np.array(createAAFreqVector(s,Lmap,comblength)) , seqs) #print X #X = (X - np.min(X, 0)) / (np.max(X, 0) + 0.0001) # 0-1 scaling #print X.shape rbm.fit(X) ssss ='MAVMAPRTLVLLLSGALALTQTWAGSHSMRYFFTSVSRPGRGEPRFIAVGYVDDTQFVRFDSDAASQRMEPRAPWIEQEGPEYWDGETRKVKAHSQTHRVDLGTLRGYYNQSEAGSHTVQRMYGCDVGSDWRFLRGYHQYAYDGKDYIALKEDLRSWTAADMAAQTTKHKWEAAHVAEQLRAYLEGTCVEWLRRYLENGKETLQRTDAPKTHMTHHAVSDHEATLRCWALSFYPAEITLTWQRDGEDQTQDTELVETRPAGDGTFQKWAAVVVPSGQEQRYTCHVQHEGLPKPLTLRWEPSSQPTIPIVGIIAGLVLFGAVITGAVVAAVMWRRKSSDRKGGSYSQAASSDSAQGSDVSL' transformedSeq = rbm.transform(np.array(createAAFreqVector(ssss,Lmap,comblength))) print transformedSeq print 'len', len(transformedSeq) # Training RBM-Logistic Pipeline #classifier.fit(X_train, Y_train) # Training Logistic regression #logistic_classifier = linear_model.LogisticRegression(C=100.0) #logistic_classifier.fit(X_train, Y_train) ############################################################################### # Evaluation print() ###############################################################################
# Flatten each sample's window into a (samples, before*77) matrix.
train_X = train_X.reshape((train_X.shape[0], before*77))
test_X = test_X.reshape((test_X.shape[0], before*77))
print(train_X.shape, train_y.shape, test_X.shape, test_y.shape)
# dbn: greedy layer-wise RBM pre-training.
input_layer = train_X
hidden_layer = [250, 500, 200]
weight_rbm = []
bias_rbm = []
for i in range(len(hidden_layer)):
    print("DBN Layer {0} Pre-training".format(i + 1))
    rbm = BernoulliRBM(n_components=hidden_layer[i], learning_rate=0.0005,
                       batch_size=512, n_iter=200, verbose=2, random_state=1)
    rbm.fit(input_layer)
    # size of weight matrix is [input_layer, hidden_layer]
    weight_rbm.append(rbm.components_.T)
    bias_rbm.append(rbm.intercept_hidden_)
    input_layer = rbm.transform(input_layer)
print('Pre-training finish.', np.shape(weight_rbm[0]), np.shape(bias_rbm[0]))
test_rms = 0
result = []
model = Sequential()
print('Fine-tuning start.')
for i in range(0, len(hidden_layer)):
    print('i:', i)
    if i == 0:
        model.add(Dense(hidden_layer[i], activation='sigmoid',
                        input_dim=np.shape(train_X)[1]))
    elif i >= 1:
        model.add(Dense(hidden_layer[i], activation='sigmoid'))
    else:
        pass
    # Seed each Dense layer with the corresponding RBM parameters.
    layer = model.layers[i]
    layer.set_weights([weight_rbm[i], bias_rbm[i]])
def RBMtest01():
    """Non-linear feature extraction with an RBM (Python 2 script style).

    Compared with running logistic regression directly on raw pixels,
    RBM features can improve classification accuracy. Adapted from the
    scikit-learn digits/RBM example; uses the pre-0.18
    sklearn.cross_validation module.
    """
    import numpy as np
    import matplotlib.pyplot as plt
    from scipy.ndimage import convolve
    from sklearn import linear_model, datasets, metrics
    from sklearn.cross_validation import train_test_split
    from sklearn.neural_network import BernoulliRBM
    from sklearn.pipeline import Pipeline

    def nudge_dataset(X, Y):
        # Shift each 8x8 image one pixel in each of four directions,
        # growing the dataset to 5x its original size.
        direction_vectors = [
            [[0, 1, 0], [0, 0, 0], [0, 0, 0]],
            [[0, 0, 0], [1, 0, 0], [0, 0, 0]],
            [[0, 0, 0], [0, 0, 1], [0, 0, 0]],
            [[0, 0, 0], [0, 0, 0], [0, 1, 0]]
        ]
        shift = lambda x, w: convolve(x.reshape((8, 8)), mode = 'constant', weights = w).ravel()
        X = np.concatenate([X] + [np.apply_along_axis(shift, 1, X, vector) for vector in direction_vectors])
        Y = np.concatenate([Y for _ in range(5)], axis = 0)
        return X, Y

    digits = datasets.load_digits()
    X = np.asarray(digits.data, 'float32')  # data type conversion, list -> float array
    X, Y = nudge_dataset(X, digits.target)  # regenerates 5x the original X, Y
    #print np.max(X, 0)
    #print np.min(X, 0)
    # Per-feature 0-1 scaling.
    # NOTE(review): "- - np.min" makes the denominator max + min, not the
    # usual max - min range — looks like a typo; confirm before changing.
    X = (X - np.min(X, 0)) / (np.max(X, 0) - - np.min(X, 0) + 0.0001)
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, random_state = 0)
    print set(Y_train)
    #'''
    # Build the models. The Pipeline is a sequential fit/transform chain;
    # an RBM's transform output is the latent representation of the data.
    logistic = linear_model.LogisticRegression()
    rbm = BernoulliRBM(random_state = 0, verbose = True)
classifier = Pipeline(steps = [('rbm', rbm), ('logistic', logistic)]) #Training #这里的参数是根据cross-validation选出来的 -- GridSearchCV rbm.learning_rate = 0.06 rbm.n_iter = 20 rbm.n_components = 100 #这里就是利用rbm 训练出100个特征 logistic.C = 6000 #rbm.fit(X_train, Y_train) rbm.fit(X_train) #rbm从数据的维数来看,首先是一个非监督的训练过程,就是从X_train中求出N个代表性的vector, #然后再把原始的X_trian投影到这N的向量上,获得X_train的新N维feature #与PCA类似 predicted_Y = rbm.transform(X_train) print rbm.components_ #rbm.components_是 100 * 64的矩阵 print len(rbm.components_) print len(rbm.components_[0]) print predicted_Y print len(predicted_Y) print len(predicted_Y[0]) print len(X_train) print len(X_train[0]) # Training RBM-Logistic Pipeline #相当于这里输入的还是每一维都进行了归一化之后的X_train #对应的Y_train还是0-9 表示label print "Start Training RBM-Logistic Pipeline" classifier.fit(X_train, Y_train) # Training Logistic regression, logistic_classifier = linear_model.LogisticRegression(C = 100.0) logistic_classifier.fit(X_train, Y_train) #Evaluation print "Logistic regression using RBM features: \n%s\n" %(metrics.classification_report(Y_test, classifier.predict(X_test))) print "Logistic regression using raw features: \n%s\n" %(metrics.classification_report(Y_test, logistic_classifier.predict(X_test))) #Plotting plt.figure(figsize = (4.2, 4)) for i, comp in enumerate(rbm.components_): plt.subplot(10, 10, i + 1) #这里获得的还是100个64维vector,然后把每一个vector都reshape到8*8显示出来 plt.imshow(comp.reshape(8,8), cmap=plt.cm.gray_r) plt.xticks(()) plt.yticks(()) plt.suptitle('100 components extracted by RBM', fontsize = 16) plt.subplots_adjust(0.08, 0.02, 0.92, 0.85, 0.23) plt.show()
# X_test = X_test[test_permut, :] # y_test = y_test[test_permut] # rbm learning # TODO: try to search better parametrs with grid search rbm = BernoulliRBM(random_state=0, verbose=True) rbm.learning_rate = 0.1 rbm.n_iter = 30 rbm.n_components = 16 print X_train print X_train.shape rbm.fit(all_feats) X_train = np.concatenate((rbm.transform(X_train), X_train_preserved), 1) X_test = np.concatenate((rbm.transform(X_test), X_test_preserved), 1) print X_train print X_train.shape ens_lbls = [] ens_probs = [] # iterate over classifiers for name, clf in zip(names, classifiers): print "[{}] learning starting ...".format(name) clf.fit(X_train, y_train) print "[{}] learning finished".format(name) probs = clf.predict_proba(X_test)[:, [1]] dump_to_file(name+"_res_probs", ids, probs)
class BernoulliRBMComponent(AutoSklearnPreprocessingAlgorithm):
    """Auto-sklearn preprocessing component wrapping sklearn's BernoulliRBM.

    Projects the input onto ``n_components`` latent features learned by a
    Bernoulli Restricted Boltzmann Machine.
    """

    def __init__(self, n_components: int = 256, learning_rate: float = 0.1,
                 batch_size: int = 10, n_iter: int = 10, random_state=None):
        super().__init__()
        self.n_components = n_components
        self.learning_rate = learning_rate
        self.batch_size = batch_size
        self.n_iter = n_iter
        self.random_state = random_state

    def fit(self, X, Y=None):
        """Train the underlying RBM on ``X``; ``Y`` is ignored. Returns self."""
        from sklearn.neural_network import BernoulliRBM

        # Hyperparameters may arrive as strings from the configuration
        # system, so coerce them to their numeric types before use.
        self.n_components = int(self.n_components)
        self.learning_rate = float(self.learning_rate)
        self.batch_size = int(self.batch_size)
        self.n_iter = int(self.n_iter)
        self.preprocessor = BernoulliRBM(n_components=self.n_components,
                                         learning_rate=self.learning_rate,
                                         batch_size=self.batch_size,
                                         n_iter=self.n_iter,
                                         random_state=self.random_state)
        # Bug fix: the original only constructed the RBM and never trained
        # it, so transform() would raise sklearn's NotFittedError.
        self.preprocessor.fit(X)
        return self

    def transform(self, X):
        """Project ``X`` through the fitted RBM.

        Raises NotImplementedError if fit() has not been called.
        """
        if self.preprocessor is None:
            raise NotImplementedError()
        return self.preprocessor.transform(X)

    @staticmethod
    def get_properties(dataset_properties=None):
        return {
            'shortname': 'BernoulliRBM',
            'name': 'Bernoulli Restricted Bolzman Machine',
            'handles_regression': True,
            'handles_classification': True,
            'handles_multiclass': True,
            'handles_multilabel': True,
            'handles_multioutput': True,
            # Training shuffles minibatches, so results vary with the seed.
            'is_deterministic': False,
            'input': (DENSE, SPARSE, UNSIGNED_DATA),
            'output': (DENSE, UNSIGNED_DATA)
        }

    @staticmethod
    def get_hyperparameter_search_space(dataset_properties=None):
        n_components = UniformIntegerHyperparameter("n_components", 1, 512,
                                                    default_value=256)
        learning_rate = UniformFloatHyperparameter("learning_rate", 1e-5, 1.,
                                                   default_value=0.1)
        batch_size = UniformIntegerHyperparameter("batch_size", 1, 100,
                                                  default_value=10)
        n_iter = UniformIntegerHyperparameter("n_iter", 2, 200,
                                              default_value=10)
        cs = ConfigurationSpace()
        cs.add_hyperparameters(
            [n_components, n_iter, learning_rate, batch_size])
        return cs
class BoWFeature(BaseEstimator, TransformerMixin):
    """Bag-of-visual-words feature extractor.

    fit() learns an RBM "codebook" from random image patches; transform()
    represents each image as the sum of RBM activations over sampled patches.
    Each row of X is expected to hold an image path in position 0
    (``item[0]``) -- TODO confirm against the caller.
    """

    def __init__(self, patch_num=10000, patch_size=(8, 8), sample_num = 300,\
                 n_components=256, learning_rate=0.03, n_iter=100, batch_size=100):
        self.patch_num = patch_num
        self.patch_size = patch_size
        self.sample_num = sample_num
        self.n_components = n_components
        self.learning_rate = learning_rate
        self.n_iter = n_iter
        self.batch_size = batch_size

    def fit(self, X, y=None):
        """Sample patches from every image and train the RBM on them."""
        # Split the global patch budget evenly across the images.
        num = self.patch_num // X.size
        data = []
        for item in X:
            img = imread(str(item[0]))
            img = img_as_ubyte(rgb2gray(img))
            #img = self.binary(img)  # binarisation (disabled)
            tmp = extract_patches_2d(img, self.patch_size, max_patches = num,\
                                     random_state=np.random.RandomState())
            data.append(tmp)
        data = np.vstack(data)
        data = data.reshape(data.shape[0], -1)
        data = np.asarray(data, 'float32')
        # 0-1 scaling per feature (unnecessary if binarisation is enabled above);
        # the +0.0001 guards against division by zero for constant columns.
        data = data - np.min(data, 0)
        data = data/(np.max(data, 0) + 0.0001)
        self.rbm = BernoulliRBM(n_components=self.n_components,\
                                learning_rate=self.learning_rate, \
                                n_iter=self.n_iter,\
                                batch_size=self.batch_size,\
                                verbose=True)
        self.rbm.fit(data)
        return self

    def transform(self, X):
        """Encode each image as the sum of RBM activations over its patches."""
        results = []
        for sample in X:
            img = imread(str(sample[0]))
            img = img_as_ubyte(rgb2gray(img))
            #img = self.binary(img)
            patches = extract_patches_2d(img, self.patch_size,\
                                         max_patches = self.sample_num,\
                                         random_state=np.random.RandomState())
            patches = patches.reshape(patches.shape[0], -1)
            patches = np.asarray(patches, 'float32')
            patches = patches-np.min(patches, 0)
            patches = patches/(np.max(patches, 0) + 0.0001)
            patches = self.rbm.transform(patches)
            # Pool over patches: one fixed-length vector per image.
            results.append(patches.sum(axis=0))
        return np.vstack(results)

    def get_params(self, deep=True):
        return {"patch_num": self.patch_num,
                "sample_num": self.sample_num,
                "patch_size": self.patch_size,
                "learning_rate": self.learning_rate,
                "n_components": self.n_components,
                "n_iter": self.n_iter,
                "batch_size": self.batch_size}

    def set_params(self, **parameters):
        for parameter, value in parameters.items():
            self.__setattr__(parameter, value)
        return self

    def binary(self, img):
        """Binarise ``img`` via Otsu thresholding of its Sobel edge map."""
        edge = sobel(img)
        thresh = threshold_otsu(edge)
        edge = edge >= thresh
        # Bug fix: np.int was deprecated in NumPy 1.20 and removed in 1.24;
        # the builtin int is the documented, behaviour-identical replacement.
        return edge.astype(int)
# Unlabelled slice used only for unsupervised RBM pre-training; the
# labelled slice (X_train_lab / y_train_lab) is presumably defined
# above this fragment -- verify against the surrounding file.
X_train_unlab = X_train0[N_LABEL:N_UNLAB]
X_validation = mnist.validation.images[:N_CV]
y_validation = mnist.validation.labels[:N_CV]
X_test = mnist.test.images
y_test = mnist.test.labels
rbm = BernoulliRBM(random_state=0, verbose=True)
rbm.learning_rate = 0.03
rbm.n_iter = 10
# More components tend to give better prediction performance, but larger
# fitting time
rbm.n_components = 500
print('\nRBM Training...')
rbm.fit(X_train_unlab)  # train by unlabelled data
# Project every split into the 500-dimensional RBM feature space.
X_train_rbmfitted = rbm.transform(X_train_lab)
X_validation_rbmfitted = rbm.transform(X_validation)
X_test_rbmfitted = rbm.transform(X_test)
gbm = lgb.LGBMClassifier(objective='multiclass',
                         num_leaves=63,
                         learning_rate=0.01,
                         n_estimators=1000)
gbm.fit(
    X_train_rbmfitted, y_train_lab,  # train by labbelled data
    eval_set=[(X_validation_rbmfitted, y_validation)],
    eval_metric='multi_logloss',
    early_stopping_rounds=10)
# Predict with the iteration count chosen by early stopping.
y_pred = gbm.predict(X_test_rbmfitted, num_iteration=gbm.best_iteration)
import numpy as np train = pandas.read_csv("train.csv") target = train["Survived"] m = Massager() train_array = m.transform(train, True) brbm = BernoulliRBM(n_components=3, learning_rate=0.01) trantrain = brbm.fit_transform(train_array) param_grid = dict(C=np.logspace(-10, 2, 13), gamma=np.logspace(-9, 3, 13)) grid = GridSearchCV(svm.SVC(), param_grid=param_grid) grid.fit(trantrain, target) C = grid.best_params_['C'] gamma = grid.best_params_['gamma'] classifier = svm.SVC(C=C, gamma=gamma) classifier.fit(trantrain, target) vscore = cross_val_score(classifier, train_array, target) print "Validation score: {0} sd: {1}".format(vscore.mean(), vscore.std()) test = pandas.read_csv("test.csv") answers = pandas.DataFrame(test["PassengerId"]) test_array = m.transform(test) trantest = brbm.transform(test_array) predictions = classifier.predict(trantest) print(classifier.score(trantrain, target)) answers['Survived'] = pandas.Series(predictions.astype(int)) answers.to_csv("solution_rbm_svm.csv", index=False)
# ====== plda ====== # plda = PLDA(n_phi=NUM_DIM, random_state=SEED) plda.fit(X_train, y_train) X_train_plda = plda.predict_log_proba(X_train) X_score_plda = plda.predict_log_proba(X_score) # ====== gmm ====== # gmm = GaussianMixture(n_components=NUM_DIM, max_iter=100, covariance_type='full', random_state=SEED) gmm.fit(X_train) X_train_gmm = gmm._estimate_weighted_log_prob(X_train) X_score_gmm = gmm._estimate_weighted_log_prob(X_score) # ====== rbm ====== # rbm = BernoulliRBM(n_components=NUM_DIM, batch_size=8, learning_rate=0.0008, n_iter=8, verbose=2, random_state=SEED) rbm.fit(X_train) X_train_rbm = rbm.transform(X_train) X_score_rbm = rbm.transform(X_score) # =========================================================================== # Deep Learning # =========================================================================== # =========================================================================== # Visualize # =========================================================================== def plot(train, score, title, applying_pca=False): if applying_pca: pca = PCA(n_components=NUM_DIM) pca.fit(train) train = pca.transform(train) score = pca.transform(score) plot_figure(nrow=6, ncol=12)