def test_sparse_and_verbose():
    """Make sure RBM works with sparse input when verbose=True."""
    old_stdout = sys.stdout
    sys.stdout = StringIO()
    from scipy.sparse import csc_matrix
    X = csc_matrix([[0.], [1.]])
    rbm = BernoulliRBM(n_components=2, batch_size=2, n_iter=1,
                       random_state=42, verbose=True)
    try:
        rbm.fit(X)
        s = sys.stdout.getvalue()
        # make sure output is sound
        assert_true(re.match(r"\[BernoulliRBM\] Iteration 1,"
                             r" pseudo-likelihood = -?(\d)+(\.\d+)?,"
                             r" time = (\d|\.)+s", s))
    finally:
        sys.stdout = old_stdout
def do_train(
        hdf='/home/yacc/packages/btc-trade-result-history/btc_all_in_one.h5',
        dataset='a'):
    """Train a stacked RBM + SGD regression pipeline on price data from an HDF5 store."""
    h = pd.HDFStore(hdf, 'r')
    df = h[dataset]
    h.close()
    X, y = gen_dataset_from_price(df, step=200, ahead=20, percent=0.01)
    print('\n data generated.')
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)
    print('train test split done.')
    # params = {'learning_rate': 0.1, 'n_iter': 20}
    # reg_clf = GradientBoostingRegressor(verbose=True, **params)
    reg_clf = SGDRegressor(verbose=True, n_iter=100)
    clf_rbm1 = BernoulliRBM(n_components=1024, verbose=True)
    clf_rbm2 = BernoulliRBM(n_components=512, verbose=True)
    clf_rbm3 = BernoulliRBM(n_components=256, verbose=True)
    clf = Pipeline(steps=[('clf1', clf_rbm1), ('clf2', clf_rbm2),
                          ('clf3', clf_rbm3), ('clf_last', reg_clf)])
    print('start training')
    clf.fit(X_train, y_train)
    # pickle needs a binary file handle; text append mode ('a+') would corrupt the dump
    with open('clf_pipeline_pick.pkl', 'wb') as f:
        pickle.dump(clf, f)
    print('pickle done.')
def pretrain(self, save=True):
    visual_layer = self.data

    for i in range(len(self.hidden_sizes)):
        print("[DBN] Layer {} Pre-Training".format(i + 1))

        rbm = BernoulliRBM(n_components=self.hidden_sizes[i],
                           n_iter=self.rbm_iters[i],
                           learning_rate=self.rbm_learning_rate[i],
                           verbose=True,
                           batch_size=32)
        rbm.fit(visual_layer)
        self.rbm_weights.append(rbm.components_)
        self.rbm_biases.append(rbm.intercept_hidden_)
        self.rbm_h_act.append(rbm.transform(visual_layer))

        visual_layer = self.rbm_h_act[-1]

    if save:
        with open(self.outdir + "rbm_weights.p", 'wb') as f:
            pickle.dump(self.rbm_weights, f)
        with open(self.outdir + "rbm_biases.p", 'wb') as f:
            pickle.dump(self.rbm_biases, f)
        with open(self.outdir + "rbm_hidden.p", 'wb') as f:
            pickle.dump(self.rbm_h_act, f)
def restrictedBoltzmannMachine(trainData, trainLabels, testData):
    logistic = linear_model.LogisticRegression(solver='lbfgs', max_iter=10000,
                                               multi_class='multinomial')
    rbm = BernoulliRBM(random_state=0, batch_size=2000, verbose=True)
    rbm_features_classifier = Pipeline(steps=[('rbm', rbm), ('logistic', logistic)])

    # #########################################################################
    # Training

    # Hyper-parameters. These were set by cross-validation using a
    # GridSearchCV. Here we are not performing cross-validation to save time.
    rbm.learning_rate = 0.06
    rbm.n_iter = 20
    # More components tend to give better prediction performance, but longer
    # fitting time.
    rbm.n_components = 100
    logistic.C = 6000

    # Training the RBM-Logistic pipeline
    rbm_features_classifier.fit(trainData, trainLabels)

    labels = rbm_features_classifier.predict(testData)
    # labels = list(labels)
    return labels
def fit(self, X, y=None):
    num = self.patch_num // X.size
    data = []
    for item in X:
        img = imread(str(item[0]))
        img = img_as_ubyte(rgb2gray(img))
        # img = self.binary(img)  # binarize
        tmp = extract_patches_2d(img, self.patch_size, max_patches=num,
                                 random_state=np.random.RandomState())
        data.append(tmp)

    data = np.vstack(data)
    data = data.reshape(data.shape[0], -1)
    data = np.asarray(data, 'float32')

    # 0-1 scaling (not needed if the patches were binarized above)
    data = data - np.min(data, 0)
    data = data / (np.max(data, 0) + 0.0001)

    self.rbm = BernoulliRBM(n_components=self.n_components,
                            learning_rate=self.learning_rate,
                            n_iter=self.n_iter,
                            batch_size=self.batch_size,
                            verbose=True)
    self.rbm.fit(data)
    return self
def test_nn(folder='data_270_json'):
    all_data = put_together(folder)
    vec = DictVectorizer()
    all_detects_vec = vec.fit_transform(all_data['defects'])
    model = BernoulliRBM()
    model.fit(all_detects_vec)
    ready = []
    for fn in os.listdir(folder):
        data = None
        fullname = os.path.join(folder, fn)
        if os.path.isfile(fullname):
            with open(fullname) as f:
                try:
                    data = json.load(f)
                except ValueError:
                    pass
        if data:
            fe = get_features(data)
            if len(fe['defects']) > 0:
                # use a separate name so the fitted vectorizer is not overwritten
                defects_vec = vec.transform(fe['defects'])
                p = model.transform(defects_vec)
                data['vd'] = p.tolist()
                r = {}
                r['vzw'] = data['vzw']
                r['defects'] = p.tolist()
                r['measurement'] = fe['measurement']
                ready.append(r)
def init_coefs_(X, y):
    model = BernoulliRBM(random_state=0, verbose=True, learning_rate=0.1, n_iter=20)
    model.fit(X, y)
    return model.intercept_visible_
def train_rbm_stack(data, network_structure, batch_size=10, learning_rate=0.1,
                    n_iter=10, random_state=None, verbose=0):
    weights = []
    visible_unit_samples = data
    for layer in network_structure:
        model = BernoulliRBM(n_components=layer, batch_size=batch_size,
                             learning_rate=learning_rate, n_iter=n_iter,
                             random_state=random_state, verbose=verbose)
        hidden_unit_samples = model.fit_transform(visible_unit_samples)
        weights.append(model.components_)
        visible_unit_samples = hidden_unit_samples
    return weights
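# Illustrative usage sketch for train_rbm_stack (not from the original source):
# the toy binary data and the [32, 16] layer sizes below are assumptions.
import numpy as np

rng = np.random.RandomState(0)
toy_data = rng.binomial(1, 0.3, size=(200, 64)).astype(np.float64)

stack_weights = train_rbm_stack(toy_data, network_structure=[32, 16],
                                batch_size=10, learning_rate=0.05,
                                n_iter=5, random_state=0)
# One weight matrix per layer, each of shape (n_components, n_features):
# here (32, 64) and (16, 32).
print([w.shape for w in stack_weights])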
def train_nn(data, expected_values):
    data, expected_values = preprocess_data(data, expected_values,
                                            remove_high_rr=False)
    logger.info("Starting feature reduction.")
    X = np.asarray(data[1:], 'float64')
    logger.info("Done with feature reduction.")
    Y = expected_values
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2,
                                                        random_state=0)
    logger.info("Starting NeuralNetwork training.")
    logistic = linear_model.LogisticRegression()
    rbm = BernoulliRBM(random_state=0, verbose=True)
    clf = Pipeline(steps=[('rbm', rbm), ('logistic', logistic)])
    rbm.learning_rate = 0.06
    rbm.n_iter = 20
    rbm.n_components = 100
    logistic.C = 1.0
    clf.fit(X_train, Y_train)

    # Evaluation
    # TODO: Make unified evaluation
    logger.info("Logistic regression using RBM features:\n%s\n"
                % (metrics.classification_report(Y_test, clf.predict(X_test))))
    logger.info("Done with NeuralNetwork training.")
    return lambda x: wrap_threshold_distribtuion(
        np.array(clf.predict(x)).astype(float))
def RBM_SVM(trainfeatures, testfeatures, trainlabels, testlabels):
    # ******************* scikit-learn RBM + SVM *******************
    print("train RBM+SVM model")

    # trainfeatures = (trainfeatures - np.min(trainfeatures, 0)) / \
    #     (np.max(trainfeatures, 0) + 0.0001)  # 0-1 scaling
    min_max_scaler = preprocessing.MinMaxScaler()
    trainfeatures_fs = min_max_scaler.fit_transform(trainfeatures)
    testfeatures_fs = min_max_scaler.transform(testfeatures)

    # SVM parameters
    clf = svm.SVC(C=5.0, kernel='sigmoid', degree=3, gamma=0.5, coef0=10.0,
                  shrinking=True, probability=False, tol=0.001, cache_size=200,
                  class_weight=None, verbose=False, max_iter=-1,
                  random_state=None)

    # RBM parameters
    rbm = BernoulliRBM(random_state=0, verbose=True)
    rbm.learning_rate = 0.06
    rbm.n_iter = 20

    # Machine learning pipeline
    classifier = Pipeline(steps=[('rbm', rbm), ('svm', clf)])

    # More components tend to give better prediction performance, but longer
    # fitting time.
    rbm.n_components = 400

    classifier.fit(trainfeatures_fs, trainlabels)
    results = classifier.predict(testfeatures_fs)
    results = results.ravel()
    testerror = float(len(testlabels) - np.sum(testlabels == results)) / float(len(testlabels))
    # print("error rate with SVM is %.4f" % testerror)
    return testerror
def SGD():
    SGD = linear_model.SGDClassifier(loss='hinge', penalty='l2', random_state=42,
                                     n_jobs=-1, epsilon=0.001)
    rbm = BernoulliRBM(random_state=0, verbose=True)
    classifier = Pipeline(steps=[('rbm', rbm), ('SGD', SGD)])

    # RBM parameters obtained after cross-validation
    rbm.learning_rate = 0.01
    rbm.n_iter = 15
    rbm.n_components = 50
    # SGDClassifier is regularized through alpha; it has no C parameter
    SGD.alpha = 0.0001

    # Training SGD
    SGD_classifier = linear_model.SGDClassifier(loss='hinge', penalty='l2',
                                                random_state=42, n_jobs=-1,
                                                alpha=0.0001, epsilon=0.001)
    SGD_classifier.fit(data_train, target_train)

    # Training RBM-SGD Pipeline
    classifier.fit(data_train, target_train)

    print("printing_results")

    print("SGD using RBM features:\n%s\n" %
          (metrics.classification_report(target_test, classifier.predict(data_test))))
    cm = confusion_matrix(target_test, classifier.predict(data_test))
    plt.matshow(cm)
    plt.title('Confusion Matrix SVM with SGD with RBM Features')
    plt.colorbar()
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')
    plt.savefig('confusion_matrix1.jpg')

    print("SGD using raw pixel features:\n%s\n" %
          (metrics.classification_report(target_test, SGD_classifier.predict(data_test))))
    cm1 = confusion_matrix(target_test, SGD_classifier.predict(data_test))
    plt.matshow(cm1)
    plt.title('Confusion Matrix SVM with SGD Raw Features')
    plt.colorbar()
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')
    plt.savefig('confusion_matrix2.jpg')
def train_rbm_pcd(x_train, x_val, n_hidden, lr, inftype, n_iter=1000):
    assert n_iter > 100  # need enough iterations for several checkpoints
    rbm = BernoulliRBM(
        n_components=n_hidden,
        learning_rate=lr,
        batch_size=x_train.shape[0],
        n_iter=n_iter,
        verbose=0,
    )
    best_score, best_rbm = np.inf, None
    for it in range(n_iter):
        rbm.partial_fit(x_train)
        if (it + 1) % 20 == 0:  # checkpoint every 20 iterations
            score = test_rbm_pcd(
                x_val,
                rbm.components_,
                rbm.intercept_hidden_,
                rbm.intercept_visible_,
                inftype,
            )
            if score < best_score:
                best_score = score
                best_rbm = (
                    rbm.components_.copy(),
                    rbm.intercept_hidden_.copy(),
                    rbm.intercept_visible_.copy(),
                )
    return best_rbm, best_score
def build_classifier(clf_name):
    clf = None
    parameters = {}
    if clf_name == "svm":
        clf = svm.SVC(kernel='linear', C=10)
        parameters = {}
    elif clf_name == "knn":
        clf = neighbors.KNeighborsClassifier(n_neighbors=5, weights='uniform',
                                             algorithm='brute', leaf_size=30,
                                             metric='cosine', metric_params=None)
    elif clf_name == "rmb":
        logistic = linear_model.LogisticRegression()
        rbm = BernoulliRBM(random_state=0, verbose=True)
        rbm.learning_rate = 0.01
        rbm.n_iter = 20
        rbm.n_components = 100
        logistic.C = 6000
        clf = Pipeline(steps=[('rbm', rbm), ('logistic', logistic)])
        # parameters = {'clf__C': (1, 10)}
    elif clf_name == "tsne":
        clf = TSNE(n_components=2, init='random', metric='cosine')

    return clf, parameters
def rbm_001():
    s = 15
    crop = 150
    n_patches = 400000
    rf_size = 5

    train_x_crop_scale = CropScaleImageTransformer(
        training=True,
        result_path='data/data_train_crop_{}_scale_{}.npy'.format(crop, s),
        crop_size=crop,
        scaled_size=s,
        n_jobs=-1,
        memmap=True)

    patch_extractor = models.KMeansFeatures.PatchSampler(n_patches=n_patches,
                                                         patch_size=rf_size,
                                                         n_jobs=-1)
    images = train_x_crop_scale.transform()
    images = images.reshape((images.shape[0], 15 * 15 * 3))

    # RBM needs inputs to be between 0 and 1
    scaler = MinMaxScaler()
    images = scaler.fit_transform(images)

    # Training takes a long time -- about 80 seconds per iteration, and it
    # seems like longer -- and this is only with 256 components.
    rbm = BernoulliRBM(verbose=1)
    rbm.fit(images)

    train_x = rbm.transform(images)
    train_y = classes.train_solutions.data

    # 0.138 CV on 50% of the dataset
    wrapper = ModelWrapper(models.Ridge.RidgeRFEstimator,
                           {'alpha': 500, 'n_estimators': 500},
                           n_jobs=-1)
    wrapper.cross_validation(train_x, train_y, sample=0.5, parallel_estimator=True)
def SGD_cross_validation():
    SGD = linear_model.SGDClassifier(loss='hinge', penalty='l2', random_state=42,
                                     n_jobs=-1, epsilon=0.001)

    # cross-validation for the SGD classifier
    rbm = BernoulliRBM(random_state=0, verbose=True)
    classifier = Pipeline(steps=[('rbm', rbm), ('SGD', SGD)])
    rbm.n_iter = 100

    cv = cross_validation.StratifiedKFold(output, 3)
    score_func = metrics.f1_score
    # SGDClassifier is regularized via alpha; it has no C parameter, so only
    # alpha is searched here.
    parameters = {
        "rbm__learning_rate": [0.1, 0.01, 0.001, 0.0001],
        "rbm__n_components": [100, 200, 300, 400, 500, 600, 700, 800],
        "SGD__alpha": [0.1, 0.01, 0.001, 0.0001],
    }
    grid_search = GridSearchCV(classifier, parameters, score_func=score_func, cv=cv)
    grid_search.fit(input, output)

    print("Best %s: %0.3f" % (score_func.__name__, grid_search.best_score_))
    print("Best parameters set:")
    best_parameters = grid_search.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))
def test_rbm_pcd_gibbs(x_test, Whv, bh, bv, p_target=0.5, n_gibbs_steps=5000,
                       thinning=10, burnin=20):
    rbm = BernoulliRBM(n_components=Whv.shape[0], learning_rate=0.0)
    rbm.components_, rbm.intercept_hidden_, rbm.intercept_visible_ = Whv, bh, bv

    # 0: target node, 1: evidence node
    evidence_mask = np.random.binomial(1, p_target, x_test.shape)
    V = np.random.binomial(1, p_target, x_test.shape)
    V = x_test * evidence_mask + V * (1 - evidence_mask)

    prob1 = np.zeros_like(V)
    count = 0
    for it in range(n_gibbs_steps):
        V = rbm.gibbs(V)
        V = x_test * evidence_mask + V * (1 - evidence_mask)
        if (it + 1) % thinning == 0 and it > burnin:
            prob1 += V
            count += 1
    prob1 /= count

    prob1_clipped = prob1.clip(1e-15, 1 - 1e-15)
    target_mask = 1 - evidence_mask
    logp = x_test * np.log(prob1_clipped) + (1 - x_test) * np.log(1 - prob1_clipped)
    logp *= target_mask
    return -logp.sum() / target_mask.sum() / np.log(2)
def train(self, train_set, train_labels):
    if self.supervised and train_labels is not None:
        return self._train_unsupervised_methods_per_class(train_set, train_labels)
    else:
        model = BernoulliRBM(**self.classifier_kwargs)
        model.fit(train_set)
        return model
def rbm():
    X_train, Y_train, X_test, Y_test = train_test_data(is_feature=False)

    rbm = BernoulliRBM(random_state=0, verbose=True)
    logistic = linear_model.LogisticRegression(solver='newton-cg', tol=1)
    rbm_features_classifier = Pipeline(steps=[('rbm', rbm), ('logistic', logistic)])

    rbm.learning_rate = 0.06
    rbm.n_iter = 10
    # More components tend to give better prediction performance, but longer
    # fitting time.
    rbm.n_components = 100
    logistic.C = 50

    X_train = X_train.reshape(X_train.shape[0], -1)
    # Training the RBM-Logistic pipeline
    rbm_features_classifier.fit(X_train, Y_train)

    # # Training the Logistic regression classifier directly on the raw pixels
    # raw_pixel_classifier = clone(logistic)
    # raw_pixel_classifier.C = 100.
    # raw_pixel_classifier.fit(X_train, Y_train)

    X_test = X_test.reshape(X_test.shape[0], -1)
    Y_pred = rbm_features_classifier.predict(X_test)
    # print("Logistic regression using RBM features:\n%s\n" % (
    #     metrics.classification_report(Y_test, Y_pred)))

    # Y_pred = raw_pixel_classifier.predict(X_test)
    result_analysis(Y_pred, Y_test, 'BernoulliRBM')
def process_machine_learning(symbol, i, path):
    params['path'] = path
    label, feature = load_data(params['path'])

    # scale values in features so that they range from 0 to 1
    minmaxScaler = MinMaxScaler()
    feature = minmaxScaler.fit_transform(feature)

    print("Dimensions")
    print("label", label.shape)
    print("feature", feature.shape)

    # feature reduction using RBM
    start_time = time.time()
    rbm = BernoulliRBM(n_components=params['reduced_feature'],
                       learning_rate=params['learning_rate'],
                       batch_size=params['batchsize'],
                       n_iter=params['n_iter'])
    feature = rbm.fit_transform(feature)
    print("RBM--- %s seconds ---" % (time.time() - start_time))

    print("Dimensions after RBM")
    print("label", label.shape)
    print("feature", feature.shape)

    x_train, x_test, y_train, y_test = train_test_split(feature, label, i)
    y_pred = random_forest(x_train, x_test, y_train)

    signal_pd = pd.DataFrame({'y_test': y_test[:, 0], 'y_pred': y_pred})
    signal_pd.to_csv(os.path.join('..', 'data', 'rbm_random_forest', symbol,
                                  symbol + '_' + str(i) + '.csv'))
def run_test(params, model):
    if model == "rf":
        n_tree, mtry = params
        print("# Trees: ", n_tree)
        print("mtry: ", mtry)
        rf = RandomForestClassifier(n_estimators=int(n_tree), verbose=True,
                                    n_jobs=-1, max_features=int(mtry))
        rf.fit(X, y)
        modelPred = rf.predict(X)
    elif model == "svm":
        C, kernel = params
        print("# Cost: ", C)
        print("kernel: ", kernel)
        svmod = SVC(C=int(C), kernel=kernel)
        svmod.fit(X, y)
        modelPred = svmod.predict(X)
    elif model == "knn":
        k = params
        print("# k: ", k)
        knnmod = KNeighborsClassifier(n_neighbors=int(k))
        knnmod.fit(X, y)
        modelPred = knnmod.predict(X)
    elif model == "NeuralNetwork":
        n_components, learning_rate, batch_size, n_iter = params
        print("# n_components: ", n_components)
        print("# learning_rate: ", learning_rate)
        print("# batch_size: ", batch_size)
        print("# n_iter: ", n_iter)
        nnmod = BernoulliRBM(n_components=int(n_components),
                             learning_rate=learning_rate,
                             batch_size=int(batch_size),
                             n_iter=int(n_iter))
        nnmod.fit(X, y)
        modelPred = nnmod.score_samples(X)

    accuError = AccuracyErrorCalc(y, modelPred)
    return accuError
def Logistic():
    logistic = linear_model.LogisticRegression()
    rbm = BernoulliRBM(random_state=0, verbose=True)
    classifier = Pipeline(steps=[('rbm', rbm), ('logistic', logistic)])

    # RBM parameters obtained after cross-validation
    rbm.learning_rate = 0.01
    rbm.n_iter = 121
    rbm.n_components = 700
    logistic.C = 1.0

    # Training RBM-Logistic Pipeline
    classifier.fit(data_train, target_train)

    # Training Logistic regression
    logistic_classifier = linear_model.LogisticRegression(C=1.0)
    logistic_classifier.fit(data_train, target_train)

    print("printing_results")

    print("Logistic regression using RBM features:\n%s\n" %
          (metrics.classification_report(target_test, classifier.predict(data_test))))
    cm3 = confusion_matrix(target_test, classifier.predict(data_test))
    plt.matshow(cm3)
    plt.title('Confusion Matrix Logistic Regression with RBM Features')
    plt.colorbar()
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')
    plt.savefig('confusion_matrix3.jpg')

    print("Logistic regression using raw pixel features:\n%s\n" %
          (metrics.classification_report(target_test, logistic_classifier.predict(data_test))))
    cm4 = confusion_matrix(target_test, logistic_classifier.predict(data_test))
    plt.matshow(cm4)
    plt.title('Confusion Matrix Logistic Regression')
    plt.colorbar()
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')
    plt.savefig('confusion_matrix4.jpg')

# Logistic()
def words_to_vec(df):
    print("Method: words_to_vec. Working on words to vecs....")
    buzzCount = CountVectorizer(stop_words='english', max_features=50,
                                ngram_range=(1, 1), token_pattern=u'.*_.*')
    buzzCount_te_sparse = buzzCount.fit_transform(df["buzzers"])

    buzzTFid = TfidfVectorizer(stop_words='english', max_features=500,
                               ngram_range=(2, 9))
    buzzTFid_te_sparse = buzzTFid.fit_transform(df["description"])

    _boltzman = BernoulliRBM(n_components=35)
    _boltzman.fit(buzzTFid_te_sparse)
    buzzTFid_boltzman = _boltzman.transform(buzzTFid_te_sparse)

    buzzCount_df = pd.DataFrame(buzzCount_te_sparse.toarray(),
                                columns=buzzCount.get_feature_names())
    buzzTFid_boltzman_cols = ['buzz_boltz_' + str(ag)
                              for ag in range(1, buzzTFid_boltzman.shape[1] + 1)]
    buzzTFid_boltzman_df = pd.DataFrame(buzzTFid_boltzman,
                                        columns=buzzTFid_boltzman_cols)
    df = pd.concat([df, buzzCount_df, buzzTFid_boltzman_df], axis=1)

    # fagg = FeatureAgglomeration(n_clusters=100)
    # fagg.fit(buzzTFid_te_sparse.toarray())
    # buzzTFid_fagg = fagg.transform(buzzTFid_te_sparse.toarray())
    # buzzCount_df = pd.DataFrame(buzzCount_te_sparse.toarray(),
    #                             columns=buzzCount.get_feature_names())
    # buzzTFid_fagg_cols = ['buzz_fagg' + str(ag)
    #                       for ag in range(1, buzzTFid_fagg.shape[1] + 1)]
    # buzzTFid_fagg_df = pd.DataFrame(buzzTFid_fagg, columns=buzzTFid_fagg_cols)
    # df = pd.concat([df, buzzTFid_fagg_df], axis=1)

    print("Method: words_to_vec. Returning words to vecs....")
    return df
def fit(self, X, y):
    self.rbm_1 = BernoulliRBM(verbose=self.verbose,
                              n_components=self.n_components_first,
                              n_iter=self.n_iter_first,
                              learning_rate=self.learning_rate_first)
    self.rbm_2 = BernoulliRBM(verbose=self.verbose,
                              n_components=self.n_components_second,
                              n_iter=self.n_iter_second,
                              learning_rate=self.learning_rate_second)
    self.first_pipeline = Pipeline(steps=[('rbm_1', self.rbm_1),
                                          ('rbm_2', self.rbm_2)])
    self.first_pipeline.fit(X, y)

    # TODO improve. Look at how it is done in classify
    new_features = []
    for example, label in zip(X, y):
        transformed = self.first_pipeline.transform(example)[0]
        new_features.append(
            np.concatenate((transformed, self.label_to_feature(label))))

    self.rbm_3 = BernoulliRBM(verbose=self.verbose,
                              n_components=self.n_components_third,
                              n_iter=self.n_iter_third,
                              learning_rate=self.learning_rate_third)
    self.rbm_3.fit(new_features, y)
def combine(data):
    # unpack data
    (numerical, categorical, other, nan) = data

    # create numlog (add a little bit to prevent values <= 0)
    numlog = np.log(numerical + 0.01)
    numlog = (numlog - numlog.mean()) / (numlog.max() - numlog.min())
    numlog = numlog.fillna(0)

    # normalize and impute numerical
    numerical = (numerical - numerical.mean()) / (numerical.max() - numerical.min())
    numerical = numerical.fillna(0)

    # RBM categorical
    rbmcat = pd.get_dummies(categorical)

    # RBM other
    rbmother = pd.get_dummies(pd.DataFrame(splitcomplex(np.array(other))))

    # factorize categorical
    for column in categorical:
        categorical[column], _ = pd.factorize(categorical[column])
    categorical = (categorical - categorical.mean()) / (categorical.max() - categorical.min())

    # factorize other
    for column in other:
        other[column], _ = pd.factorize(other[column])
    other = (other - other.mean()) / (other.max() - other.min())

    ### CONVERT TO NUMPY ###
    numerical = np.array(numerical)
    numlog = np.array(numlog)
    categorical = np.array(categorical)
    rbmcat = np.array(rbmcat)
    other = np.array(other)
    rbmother = np.array(rbmother)
    nan = np.array(nan)
    ########################

    # RBM over rbmcat and rbmother
    rbm = BernoulliRBM(n_components=100, batch_size=100, n_iter=50,
                       learning_rate=0.02, verbose=1, random_state=1)
    rbmdata = rbm.fit_transform(np.concatenate((rbmcat, rbmother), axis=1))
    rbmdata = (rbmdata - rbmdata.mean()) / (rbmdata.max() - rbmdata.min())

    # normalize nan
    nan = (nan - nan.mean()) / (nan.max() - nan.min())

    # concat and return
    data = np.concatenate(
        (numerical, numlog, categorical, other, rbmdata, nan), axis=1)
    return data
def neural_network_classify(train_data, train_label, test_data):
    # Note: BernoulliRBM is an unsupervised feature extractor; it has no
    # predict() method, so it would need to be combined with a downstream
    # classifier (e.g. in a Pipeline) for this to work as a classifier.
    nnc = BernoulliRBM(random_state=0, verbose=True)
    nnc.fit(train_data, ravel(train_label))
    test_label = nnc.predict(test_data)

    save_result(test_label, 'sklearn_neural_network_classify_Result.csv')
    return test_label
def build_model(training_data):
    """Build and train the RBM."""
    rbm = BernoulliRBM(random_state=0, verbose=True, n_components=100, n_iter=50)
    rbm.fit(training_data)
    return rbm
def boltzmann_machine(train_matrix, n_comp, learning_rate=0.06, n_iter=20):
    from sklearn.neural_network import BernoulliRBM
    rbm = BernoulliRBM(n_components=n_comp, learning_rate=learning_rate,
                       n_iter=n_iter)
    rbm_transformed = rbm.fit_transform(train_matrix)
    print("successful RBM transform", rbm_transformed.shape)
    return rbm_transformed
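# Illustrative call to boltzmann_machine (not from the original source): the
# random 0/1 matrix and the 32-component setting are assumptions for a sketch.
import numpy as np

rng = np.random.RandomState(0)
train_matrix = rng.binomial(1, 0.2, size=(100, 64)).astype(np.float64)
hidden = boltzmann_machine(train_matrix, n_comp=32, learning_rate=0.05, n_iter=10)
# hidden has shape (100, 32): one row of hidden-unit probabilities per sample.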
def neural_network_classify(train_data, train_label, test_data):
    # nnc = MLPClassifier(algorithm='l-bfgs', alpha=1e-5,
    #                     hidden_layer_sizes=(5, 2), random_state=1)
    # Note: BernoulliRBM is an unsupervised feature extractor; it has no
    # predict() method, so a downstream classifier would be needed here.
    nnc = BernoulliRBM(random_state=0, verbose=True)
    nnc.fit(train_data, ravel(train_label))
    test_label = nnc.predict(test_data)

    save_result(test_label, 'sklearn_neural_network_classify_Result.csv')
    return test_label
def test_rbm_verbose():
    rbm = BernoulliRBM(n_iter=2, verbose=10)
    old_stdout = sys.stdout
    sys.stdout = StringIO()
    try:
        rbm.fit(Xdigits)
    finally:
        sys.stdout = old_stdout
def _after_publish(self, documents):
    self._tfidf_matrix = self._vectorizer.fit_transform(documents)
    self._bernoulliRBM = BernoulliRBM(learning_rate=1)
    self._rbm_matrix = self._bernoulliRBM.fit_transform(self._tfidf_matrix)
    self._bernoulliRBM_index = NearestNeighbors(len(self._service_array),
                                                algorithm='brute',
                                                metric='euclidean')
    self._bernoulliRBM_index.fit(self._rbm_matrix)
def Bernoulli(X_train, X_test, y_train, y_test):
    # Note: BernoulliRBM is an unsupervised transformer; it exposes neither
    # predict() nor a labeled score(), so the calls below would need a
    # downstream classifier to work as intended.
    mod = BernoulliRBM(random_state=0, verbose=True)
    mod.fit(X_train, y_train)
    print("Done training")
    bernoulli_labels = mod.predict(X_test)
    print("Done testing")
    bernoulli_score = mod.score(X_test, y_test)
    return bernoulli_score, bernoulli_labels
def runRBM(arr, clsfr):  # iters, lrn_rate, logistic_c_val, logistic_c_val2, n_comp, filename
    global file_dir, nEvents, solutionFile
    iters = int(arr[0] * 10)
    lrn_rate = arr[1]
    logistic_c_val = arr[2] * 1000.0
    logistic_c_val2 = arr[3] * 100.0
    n_comp = int(arr[4] * 100)
    filename = ('rbm_iter' + str(iters) + '_logc' + str(logistic_c_val) +
                '_logcc' + str(logistic_c_val2) + '_lrn' + str(lrn_rate) +
                '_nc' + str(n_comp))  # low

    logistic = linear_model.LogisticRegression()
    rbm = BernoulliRBM(random_state=0, verbose=True)
    classifier = Pipeline(steps=[('rbm', rbm), ('logistic', logistic)])

    ###########################################################################
    # Training

    # Hyper-parameters. These were set by cross-validation using a
    # GridSearchCV. Here we are not performing cross-validation to save time.
    rbm.learning_rate = lrn_rate  # 0.10 / 0.06
    rbm.n_iter = iters  # 20
    # More components tend to give better prediction performance, but longer
    # fitting time.
    rbm.n_components = n_comp  # 250
    logistic.C = logistic_c_val  # 6000.0

    # Training RBM-Logistic Pipeline
    classifier.fit(sigtr[train_input].values, sigtr['Label'].values)

    # Training Logistic regression
    logistic_classifier = linear_model.LogisticRegression(C=logistic_c_val2)  # 100.0
    logistic_classifier.fit(sigtr[train_input].values, sigtr['Label'].values)

    ###########################################################################
    # Evaluation
    if clsfr == 0:
        clsnn_pred = classifier.predict(sigtest[train_input].values)
        solnFile('clsnn_' + filename, clsnn_pred, sigtest['EventId'].values)  # , bkgtest
        ams_score = ams.AMS_metric(solutionFile, file_dir + filename + '.out', nEvents)
        print(ams_score)
        logfile.write(filename + ': ' + str(ams_score) + '\n')
    elif clsfr == 1:
        log_cls_pred = logistic_classifier.predict(sigtest[train_input].values)
        solnFile('lognn_' + filename, log_cls_pred, sigtest['EventId'].values)  # , bkgtest
        ams_score = ams.AMS_metric(solutionFile, file_dir + 'lognn_' + filename + '.out', nEvents)
        print(ams_score)
        logfile.write('lognn ' + filename + ': ' + str(ams_score) + '\n')
    else:
        logistic_classifier_tx = linear_model.LogisticRegression(C=logistic_c_val2)
        logistic_classifier_tx.fit_transform(sigtr[train_input].values, sigtr['Label'].values)
        log_cls_tx_pred = logistic_classifier_tx.predict(sigtest[train_input].values)
        solnFile('lognntx_' + filename, log_cls_tx_pred, sigtest['EventId'].values)  # , bkgtest
        ams_score = ams.AMS_metric(solutionFile, file_dir + filename + '.out', nEvents)
        print(ams_score)
        logfile.write('lognntx ' + filename + ': ' + str(ams_score) + '\n')

    return -1.0 * float(ams_score)
def transform(self, X):
    brbm = BernoulliRBM(n_components=256, learning_rate=0.1, batch_size=10,
                        n_iter=10, verbose=0, random_state=None)
    return pd.DataFrame(brbm.fit_transform(X))
def test_feature_names_out(method):
    """Check `get_feature_names_out` for `BernoulliRBM`."""
    n_components = 10
    rbm = BernoulliRBM(n_components=n_components)
    getattr(rbm, method)(Xdigits)

    names = rbm.get_feature_names_out()
    expected_names = [f"bernoullirbm{i}" for i in range(n_components)]
    assert_array_equal(expected_names, names)
def test_transformer_dtypes_casting(dtype_in, dtype_out):
    X = Xdigits[:100].astype(dtype_in)
    rbm = BernoulliRBM(n_components=16, batch_size=5, n_iter=5, random_state=42)
    Xt = rbm.fit_transform(X)

    # dtype_in and dtype_out should be consistent
    assert Xt.dtype == dtype_out, "transform dtype: {} - original dtype: {}".format(
        Xt.dtype, X.dtype
    )
def test_transform():
    X = Xdigits[:100]
    rbm1 = BernoulliRBM(n_components=16, batch_size=5, n_iter=5, random_state=42)
    rbm1.fit(X)

    Xt1 = rbm1.transform(X)
    Xt2 = rbm1._mean_hiddens(X)

    assert_array_equal(Xt1, Xt2)
def BernoulliRBM_classifier(best_parameters={}):
    from sklearn.neural_network import BernoulliRBM
    if len(best_parameters) > 0:
        model = BernoulliRBM(n_components=best_parameters['Model__n_components'],
                             learning_rate=best_parameters['Model__learning_rate'],
                             batch_size=best_parameters['Model__batch_size'],
                             n_iter=best_parameters['Model__n_iter'],
                             verbose=best_parameters['Model__verbose'],
                             random_state=best_parameters['Model__random_state'])
    else:
        model = BernoulliRBM()
    return model
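# Hedged usage sketch for BernoulliRBM_classifier (not from the original
# source): the parameter values below are assumptions, but the dictionary keys
# follow the 'Model__*' naming the function expects.
best_parameters = {
    'Model__n_components': 64,
    'Model__learning_rate': 0.05,
    'Model__batch_size': 10,
    'Model__n_iter': 20,
    'Model__verbose': 0,
    'Model__random_state': 0,
}
default_rbm = BernoulliRBM_classifier()               # all scikit-learn defaults
tuned_rbm = BernoulliRBM_classifier(best_parameters)  # tuned hyper-parameters
# tuned_rbm.fit(X) would then learn the RBM on 0-1 scaled features (X assumed).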
class DeepRbmMnistClassifier:

    def __init__(self):
        self.n_components_first = 500
        self.n_components_second = 500
        self.n_components_third = 2000
        self.n_iter_first = 20
        self.n_iter_second = 20
        self.n_iter_third = 20
        self.learning_rate_first = 0.06
        self.learning_rate_second = 0.06
        self.learning_rate_third = 0.06
        self.verbose = True

    def label_to_feature(self, y):
        feature = [0] * 10
        feature[y] = 1
        return feature

    def fit(self, X, y):
        self.rbm_1 = BernoulliRBM(verbose=self.verbose,
                                  n_components=self.n_components_first,
                                  n_iter=self.n_iter_first,
                                  learning_rate=self.learning_rate_first)
        self.rbm_2 = BernoulliRBM(verbose=self.verbose,
                                  n_components=self.n_components_second,
                                  n_iter=self.n_iter_second,
                                  learning_rate=self.learning_rate_second)
        self.first_pipeline = Pipeline(
            steps=[('rbm_1', self.rbm_1), ('rbm_2', self.rbm_2)])
        self.first_pipeline.fit(X, y)

        # TODO improve. Look at how it is done in classify
        new_features = []
        for example, label in zip(X, y):
            transformed = self.first_pipeline.transform(example)[0]
            new_features.append(
                np.concatenate((transformed, self.label_to_feature(label))))

        self.rbm_3 = BernoulliRBM(verbose=self.verbose,
                                  n_components=self.n_components_third,
                                  n_iter=self.n_iter_third,
                                  learning_rate=self.learning_rate_third)
        self.rbm_3.fit(new_features, y)

    def classify(self, X):
        transformed = self.first_pipeline.transform(X)
        transformed = np.concatenate(
            (transformed, [[0] * 10] * len(transformed)), axis=1)

        # The inverse of rbm_3, to go from the hidden layer back to the visible layer
        rbm_aux = BernoulliRBM()
        rbm_aux.intercept_hidden_ = self.rbm_3.intercept_visible_
        rbm_aux.intercept_visible_ = self.rbm_3.intercept_hidden_
        rbm_aux.components_ = np.transpose(self.rbm_3.components_)

        results = rbm_aux.transform(self.rbm_3.transform(transformed))
        results = results[:, -10:]
        return np.argmax(results, axis=1)
def test_gibbs_smoke():
    """Just check that we don't get NaNs sampling the full digits dataset."""
    rng = np.random.RandomState(42)
    X = Xdigits
    rbm1 = BernoulliRBM(n_components=42, batch_size=10, n_iter=20,
                        random_state=rng)
    rbm1.fit(X)
    X_sampled = rbm1.gibbs(X)
    assert_all_finite(X_sampled)
def test_fit():
    X = Xdigits.copy()

    rbm = BernoulliRBM(n_components=64, learning_rate=0.1, batch_size=10,
                       n_iter=7, random_state=9)
    rbm.fit(X)

    assert_almost_equal(rbm.score_samples(X).mean(), -21.0, decimal=0)

    # in-place tricks shouldn't have modified X
    assert_array_equal(X, Xdigits)
def brbm_rf(Xtr, ytr, Xte=None, yte=None):
    randomforest = ensemble.RandomForestClassifier(n_jobs=-1, n_estimators=100)
    rbm = BernoulliRBM(random_state=0)
    classifier = Pipeline(steps=[('rbm', rbm), ('randomforest', randomforest)])
    rbm.learning_rate = 0.025
    rbm.n_iter = 250
    rbm.n_components = 100
    return simple_classification(classifier, Xtr, ytr, Xte, yte)
def test_sample_hiddens():
    rng = np.random.RandomState(0)
    X = Xdigits[:100]
    rbm1 = BernoulliRBM(n_components=2, batch_size=5, n_iter=5, random_state=42)
    rbm1.fit(X)

    h = rbm1._mean_hiddens(X[0])
    hs = np.mean([rbm1._sample_hiddens(X[0], rng) for i in range(100)], 0)

    assert_almost_equal(h, hs, decimal=1)
def rbm_knn_train_and_predict(train_set_x, train_set_y, test_set_x, test_set_y):
    knn = KNeighborsClassifier(n_neighbors=5)
    rbm = BernoulliRBM(random_state=0, verbose=True)
    rbm.learning_rate = 0.06
    rbm.n_iter = 20
    rbm.n_components = 100
    classifier = Pipeline(steps=[('rbm', rbm), ('knn', knn)])
    classifier.fit(train_set_x, train_set_y)
    PRED = classifier.predict(test_set_x)
    return PRED
def rbm_dbn_train_and_predict(train_set_x, train_set_y, test_set_x, test_set_y):
    dbn = DBN(epochs=200, learn_rates=0.01)
    rbm = BernoulliRBM(random_state=0, verbose=True)
    rbm.learning_rate = 0.06
    rbm.n_iter = 20
    rbm.n_components = 100
    classifier = Pipeline(steps=[('rbm', rbm), ('dbn', dbn)])
    classifier.fit(train_set_x, train_set_y)
    PRED = classifier.predict(test_set_x)
    return PRED
def rbm_logistic_train_and_predict(train_set_x, train_set_y, test_set_x, test_set_y):
    logistic = linear_model.LogisticRegression(C=6000)
    rbm = BernoulliRBM(random_state=0, verbose=True)
    rbm.learning_rate = 0.06
    rbm.n_iter = 20
    rbm.n_components = 100
    classifier = Pipeline(steps=[('rbm', rbm), ('logistic', logistic)])
    classifier.fit(train_set_x, train_set_y)
    PRED = classifier.predict(test_set_x)
    return PRED
def test_gibbs_smoke():
    """Check if we don't get NaNs sampling the full digits dataset.

    Also check that sampling again will yield different results.
    """
    X = Xdigits
    rbm1 = BernoulliRBM(n_components=42, batch_size=40, n_iter=20,
                        random_state=42)
    rbm1.fit(X)
    X_sampled = rbm1.gibbs(X)
    assert_all_finite(X_sampled)
    X_sampled2 = rbm1.gibbs(X)
    assert_true(np.all((X_sampled != X_sampled2).max(axis=1)))
class BernoulliRBMSearchEngine(SmartSearchEngine):
    # Registry implementation using brute-force nearest-neighbour search
    # over RBM-transformed tf-idf vectors.

    def __init__(self):
        super(BernoulliRBMSearchEngine, self).__init__()
        self._service_array = []
        self._bernoulliRBM_index = None
        self._tfidf_matrix = None

    def load_configuration(self, configuration_file):
        super(BernoulliRBMSearchEngine, self).load_configuration(configuration_file)
        self._vectorizer = TfidfVectorizer(
            sublinear_tf=False,
            analyzer='word',
            lowercase=False,
            use_bm25idf=self._use_bm25idf,
            bm25_tf=self._use_bm25tf,
            k=self._bm25_k,
            preprocessor=StringPreprocessorAdapter())

    def unpublish(self, service):
        pass

    def _preprocess(self, bag_of_words):
        return bag_of_words.get_words_str()

    def _after_publish(self, documents):
        self._tfidf_matrix = self._vectorizer.fit_transform(documents)
        self._bernoulliRBM = BernoulliRBM(learning_rate=1)
        self._rbm_matrix = self._bernoulliRBM.fit_transform(self._tfidf_matrix)
        self._bernoulliRBM_index = NearestNeighbors(len(self._service_array),
                                                    algorithm='brute',
                                                    metric='euclidean')
        self._bernoulliRBM_index.fit(self._rbm_matrix)

    def publish(self, service):
        pass

    def find(self, query):
        query = StringTransformer().transform(query)
        query_array = self._vectorizer.transform(
            [self._query_transformer.transform(query).get_words_str()])
        query_array = self._bernoulliRBM.transform(query_array.toarray())
        result = self._bernoulliRBM_index.kneighbors(query_array,
                                                     return_distance=False)[0]
        result_list = []
        for index in result:
            result_list.append(self._service_array[index])
        return result_list

    def number_of_services(self):
        pass
class DeepRbmMnistClassifier:

    def __init__(self):
        self.n_components_first = 500
        self.n_components_second = 500
        self.n_components_third = 2000
        self.n_iter_first = 20
        self.n_iter_second = 20
        self.n_iter_third = 20
        self.learning_rate_first = 0.06
        self.learning_rate_second = 0.06
        self.learning_rate_third = 0.06
        self.verbose = True

    def label_to_feature(self, y):
        feature = [0] * 10
        feature[y] = 1
        return feature

    def fit(self, X, y):
        self.rbm_1 = BernoulliRBM(verbose=self.verbose,
                                  n_components=self.n_components_first,
                                  n_iter=self.n_iter_first,
                                  learning_rate=self.learning_rate_first)
        self.rbm_2 = BernoulliRBM(verbose=self.verbose,
                                  n_components=self.n_components_second,
                                  n_iter=self.n_iter_second,
                                  learning_rate=self.learning_rate_second)
        self.first_pipeline = Pipeline(steps=[('rbm_1', self.rbm_1),
                                              ('rbm_2', self.rbm_2)])
        self.first_pipeline.fit(X, y)

        # TODO improve. Look at how it is done in classify
        new_features = []
        for example, label in zip(X, y):
            transformed = self.first_pipeline.transform(example)[0]
            new_features.append(
                np.concatenate((transformed, self.label_to_feature(label))))

        self.rbm_3 = BernoulliRBM(verbose=self.verbose,
                                  n_components=self.n_components_third,
                                  n_iter=self.n_iter_third,
                                  learning_rate=self.learning_rate_third)
        self.rbm_3.fit(new_features, y)

    def classify(self, X):
        transformed = self.first_pipeline.transform(X)
        transformed = np.concatenate(
            (transformed, [[0] * 10] * len(transformed)), axis=1)

        # The inverse of rbm_3, to go from the hidden layer back to the visible layer
        rbm_aux = BernoulliRBM()
        rbm_aux.intercept_hidden_ = self.rbm_3.intercept_visible_
        rbm_aux.intercept_visible_ = self.rbm_3.intercept_hidden_
        rbm_aux.components_ = np.transpose(self.rbm_3.components_)

        results = rbm_aux.transform(self.rbm_3.transform(transformed))
        results = results[:, -10:]
        return np.argmax(results, axis=1)
def test_fit_transform():
    """Check proper implementation of fit_transform."""
    X = Xdigits[:100]
    rbm1 = BernoulliRBM(n_components=16, batch_size=5, n_iter=5, random_state=42)
    rbm2 = clone(rbm1)

    Xt1 = rbm1.fit(X).transform(X)
    Xt2 = rbm2.fit_transform(X)

    assert_array_equal(Xt1, Xt2)
def test_score_samples():
    """Check that the pseudo likelihood is computed without clipping.

    http://fa.bianp.net/blog/2013/numerical-optimizers-for-logistic-regression/
    """
    rng = np.random.RandomState(42)
    X = np.vstack([np.zeros(1000), np.ones(1000)])
    rbm1 = BernoulliRBM(n_components=10, batch_size=2, n_iter=10,
                        random_state=rng)
    rbm1.fit(X)
    assert (rbm1.score_samples(X) < -300).all()
def classify(self, X):
    transformed = self.first_pipeline.transform(X)
    transformed = np.concatenate(
        (transformed, [[0] * 10] * len(transformed)), axis=1)

    # The inverse of rbm_3, to go from the hidden layer back to the visible layer
    rbm_aux = BernoulliRBM()
    rbm_aux.intercept_hidden_ = self.rbm_3.intercept_visible_
    rbm_aux.intercept_visible_ = self.rbm_3.intercept_hidden_
    rbm_aux.components_ = np.transpose(self.rbm_3.components_)

    results = rbm_aux.transform(self.rbm_3.transform(transformed))
    results = results[:, -10:]
    return np.argmax(results, axis=1)
def test_fit_gibbs():
    # Gibbs on the RBM hidden layer should be able to recreate [[0], [1]]
    # from the same input
    rng = np.random.RandomState(42)
    X = np.array([[0.], [1.]])
    rbm1 = BernoulliRBM(n_components=2, batch_size=2, n_iter=42,
                        random_state=rng)
    # you need that many iterations
    rbm1.fit(X)
    assert_almost_equal(rbm1.components_,
                        np.array([[0.02649814], [0.02009084]]), decimal=4)
    assert_almost_equal(rbm1.gibbs(X), X)
    return rbm1
def bernoulli_rbm(data, labels):
    print('> running rbm')
    print('visible units: %d' % len(data))
    print('hidden units: %d' % hidden_units)
    print('epochs size: %d' % epochs_size)
    print('-------------')

    # random_state must be an int or a RandomState instance, not the class itself
    rbm = BernoulliRBM(batch_size=32, learning_rate=0.1, n_components=5,
                       n_iter=10, random_state=numpy.random.RandomState(),
                       verbose=True)
    rbm.fit(data, labels)

    training_data = np.array(data)
    # Note: BernoulliRBM has no train() method; the model is already fitted by
    # fit() above, so this call would fail as written.
    rbm.train(training_data, epochs_size, True)
def getNeuralModel(self, X, Y):
    logistic = linear_model.LogisticRegression()
    rbm = BernoulliRBM(verbose=True)
    # Pipeline(steps=[('rbm', rbm), ('logistic', logistic)])
    classifier = linear_model.LogisticRegression(penalty='l2', tol=.0001)
    rbm.learning_rate = 0.0001
    rbm.n_iter = 1000
    rbm.n_components = 1000
    classifier.fit(X, Y)
    return classifier
def test_partial_fit():
    X = Xdigits.copy()
    rbm = BernoulliRBM(n_components=64, learning_rate=0.1, batch_size=20,
                       random_state=9)
    n_samples = X.shape[0]
    n_batches = int(np.ceil(float(n_samples) / rbm.batch_size))
    batch_slices = np.array_split(X, n_batches)

    for i in range(7):
        for batch in batch_slices:
            rbm.partial_fit(batch)

    assert_almost_equal(rbm.score_samples(X).mean(), -21.0, decimal=0)
    assert_array_equal(X, Xdigits)
def test_fit_gibbs_sparse():
    # Gibbs on the RBM hidden layer should be able to recreate [[0], [1]] from
    # the same input even when the input is sparse; test against the non-sparse case
    rbm1 = test_fit_gibbs()
    rng = np.random.RandomState(42)
    from scipy.sparse import csc_matrix
    X = csc_matrix([[0.], [1.]])
    rbm2 = BernoulliRBM(n_components=2, batch_size=2, n_iter=42,
                        random_state=rng)
    rbm2.fit(X)
    assert_almost_equal(rbm2.components_,
                        np.array([[0.02649814], [0.02009084]]), decimal=4)
    assert_almost_equal(rbm2.gibbs(X), X.toarray())
    assert_almost_equal(rbm1.components_, rbm2.components_)
def _RBM(self, X, y):
    from sklearn.neural_network import BernoulliRBM

    # RBM model creation with the requested number of hidden components.
    # Feature extraction method, used here (after sampling) because we are
    # creating a universal model and not a this_dataset-specific one.
    neural_network = BernoulliRBM(n_components=self.k_features)
    neural_network.fit(X, y)
    X = neural_network.transform(X)

    self.feature_reduction_method = neural_network
    return X