def test_convergence_dtype_consistency():
    # float64 transformer
    X_64 = Xdigits[:100].astype(np.float64)
    rbm_64 = BernoulliRBM(n_components=16, batch_size=5, n_iter=5, random_state=42)
    Xt_64 = rbm_64.fit_transform(X_64)

    # float32 transformer
    X_32 = Xdigits[:100].astype(np.float32)
    rbm_32 = BernoulliRBM(n_components=16, batch_size=5, n_iter=5, random_state=42)
    Xt_32 = rbm_32.fit_transform(X_32)

    # results and attributes should be close enough in 32-bit and 64-bit
    assert_allclose(Xt_64, Xt_32, rtol=1e-06, atol=0)
    assert_allclose(rbm_64.intercept_hidden_, rbm_32.intercept_hidden_, rtol=1e-06, atol=0)
    assert_allclose(rbm_64.intercept_visible_, rbm_32.intercept_visible_, rtol=1e-05, atol=0)
    assert_allclose(rbm_64.components_, rbm_32.components_, rtol=1e-03, atol=0)
    assert_allclose(rbm_64.h_samples_, rbm_32.h_samples_)
def process_machine_learning(symbol, i, path):
    params['path'] = path
    label, feature = load_data(params['path'])

    # scales values in features so that they range from 0 to 1
    minmaxScaler = MinMaxScaler()
    feature = minmaxScaler.fit_transform(feature)
    print("Dimensions")
    print("label", label.shape)
    print("feature", feature.shape)

    # feature selection using RBM
    start_time = time.time()
    rbm = BernoulliRBM(n_components=params['reduced_feature'],
                       learning_rate=params['learning_rate'],
                       batch_size=params['batchsize'],
                       n_iter=params['n_iter'])
    feature = rbm.fit_transform(feature)
    print("RBM--- %s seconds ---" % (time.time() - start_time))
    print("Dimensions after RBM")
    print("label", label.shape)
    print("feature", feature.shape)

    x_train, x_test, y_train, y_test = train_test_split(feature, label, i)
    y_pred = random_forest(x_train, x_test, y_train)
    signal_pd = pd.DataFrame({'y_test': y_test[:, 0], 'y_pred': y_pred})
    signal_pd.to_csv(os.path.join('..', 'data', 'rbm_random_forest', symbol,
                                  symbol + '_' + str(i) + '.csv'))
def train_rbm_stack(data, network_structure, batch_size=10, learning_rate=0.1,
                    n_iter=10, random_state=None, verbose=0):
    weights = []
    visible_unit_samples = data
    for layer in network_structure:
        model = BernoulliRBM(n_components=layer, batch_size=batch_size,
                             learning_rate=learning_rate, n_iter=n_iter,
                             random_state=random_state, verbose=verbose)
        hidden_unit_samples = model.fit_transform(visible_unit_samples)
        weights.append(model.components_)
        visible_unit_samples = hidden_unit_samples
    return weights
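# A minimal usage sketch for train_rbm_stack above (the toy data and layer sizes
# are assumptions, not from the original source): greedy layer-wise stacking,
# where each RBM's hidden activations feed the next RBM and one weight matrix
# (rbm.components_) is collected per layer.
import numpy as np

rng = np.random.RandomState(0)
X = rng.randint(0, 2, size=(200, 64)).astype(np.float64)  # binary toy data
stack_weights = train_rbm_stack(X, network_structure=[32, 16, 8],
                                batch_size=10, learning_rate=0.1,
                                n_iter=10, random_state=0)
for depth, W in enumerate(stack_weights):
    print("layer", depth, "weights shape:", W.shape)  # (n_hidden, n_visible)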
def combine(data):
    # unpack data
    (numerical, categorical, other, nan) = data

    # create numlog (add a small offset to prevent values <= 0)
    numlog = np.log(numerical + 0.01)
    numlog = (numlog - numlog.mean()) / (numlog.max() - numlog.min())
    numlog = numlog.fillna(0)

    # normalize and impute numerical
    numerical = (numerical - numerical.mean()) / (numerical.max() - numerical.min())
    numerical = numerical.fillna(0)

    # one-hot encode categorical columns as RBM input
    rbmcat = pd.get_dummies(categorical)
    # one-hot encode "other" columns as RBM input
    rbmother = pd.get_dummies(pd.DataFrame(splitcomplex(np.array(other))))

    # factorize categorical
    for column in categorical:
        categorical[column], _ = pd.factorize(categorical[column])
    categorical = (categorical - categorical.mean()) / (categorical.max() - categorical.min())

    # factorize other
    for column in other:
        other[column], _ = pd.factorize(other[column])
    other = (other - other.mean()) / (other.max() - other.min())

    ### CONVERT TO NUMPY ###
    numerical = np.array(numerical)
    numlog = np.array(numlog)
    categorical = np.array(categorical)
    rbmcat = np.array(rbmcat)
    other = np.array(other)
    rbmother = np.array(rbmother)
    nan = np.array(nan)
    ########################

    # RBM over rbmcat and rbmother
    rbm = BernoulliRBM(n_components=100, batch_size=100, n_iter=50,
                       learning_rate=0.02, verbose=1, random_state=1)
    rbmdata = rbm.fit_transform(np.concatenate((rbmcat, rbmother), axis=1))
    rbmdata = (rbmdata - rbmdata.mean()) / (rbmdata.max() - rbmdata.min())

    # normalize nan
    nan = (nan - nan.mean()) / (nan.max() - nan.min())

    # concatenate and return
    data = np.concatenate(
        (numerical, numlog, categorical, other, rbmdata, nan), axis=1)
    return data
def boltzmann_machine(train_matrix, n_comp, learning_rate=0.06, n_iter=20):
    from sklearn.neural_network import BernoulliRBM
    rbm = BernoulliRBM(n_components=n_comp, learning_rate=learning_rate, n_iter=n_iter)
    rbm_transformed = rbm.fit_transform(train_matrix)
    print("successful RBM transform", rbm_transformed.shape)
    return rbm_transformed
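# A minimal call sketch for boltzmann_machine above (the toy matrix and the
# choice of 10 components are assumptions, not from the original source):
# reduce 64 features in [0, 1] to 10 hidden-unit activations.
import numpy as np

toy = np.random.RandomState(0).rand(500, 64)  # values already in [0, 1]
reduced = boltzmann_machine(toy, n_comp=10)
print(reduced.shape)  # expected: (500, 10)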
def test_transformer_dtypes_casting(dtype_in, dtype_out):
    X = Xdigits[:100].astype(dtype_in)
    rbm = BernoulliRBM(n_components=16, batch_size=5, n_iter=5, random_state=42)
    Xt = rbm.fit_transform(X)

    # dtype_in and dtype_out should be consistent
    assert Xt.dtype == dtype_out, "transform dtype: {} - original dtype: {}".format(
        Xt.dtype, X.dtype
    )
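# A hedged driver for the dtype-casting test above (the dtype pairs are an
# assumption about how the suite parametrizes it): float32 and float64 inputs
# are expected to be preserved, while integer input is upcast to float64.
import numpy as np

for dtype_in, dtype_out in [(np.float32, np.float32),
                            (np.float64, np.float64),
                            (int, np.float64)]:
    test_transformer_dtypes_casting(dtype_in, dtype_out)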
def transform(self, X):
    brbm = BernoulliRBM(n_components=256, learning_rate=0.1, batch_size=10,
                        n_iter=10, verbose=0, random_state=None)
    return pd.DataFrame(brbm.fit_transform(X))
class BernoulliRBMSearchEngine(SmartSearchEngine):
    #
    # Registry implementation using an RBM projection and nearest-neighbor search

    def __init__(self):
        super(BernoulliRBMSearchEngine, self).__init__()
        self._service_array = []
        self._bernoulliRBM_index = None
        self._tfidf_matrix = None

    def load_configuration(self, configuration_file):
        super(BernoulliRBMSearchEngine, self).load_configuration(configuration_file)
        self._vectorizer = TfidfVectorizer(sublinear_tf=False,
                                           analyzer='word',
                                           lowercase=False,
                                           use_bm25idf=self._use_bm25idf,
                                           bm25_tf=self._use_bm25tf,
                                           k=self._bm25_k,
                                           preprocessor=StringPreprocessorAdapter())

    def unpublish(self, service):
        pass

    def _preprocess(self, bag_of_words):
        return bag_of_words.get_words_str()

    def _after_publish(self, documents):
        self._tfidf_matrix = self._vectorizer.fit_transform(documents)
        self._bernoulliRBM = BernoulliRBM(learning_rate=1)
        self._rbm_matrix = self._bernoulliRBM.fit_transform(self._tfidf_matrix)
        self._bernoulliRBM_index = NearestNeighbors(len(self._service_array),
                                                    algorithm='brute',
                                                    metric='euclidean')
        self._bernoulliRBM_index.fit(self._rbm_matrix)

    def publish(self, service):
        pass

    def find(self, query):
        query = StringTransformer().transform(query)
        query_array = self._vectorizer.transform(
            [self._query_transformer.transform(query).get_words_str()])
        query_array = self._bernoulliRBM.transform(query_array.toarray())
        result = self._bernoulliRBM_index.kneighbors(query_array,
                                                     return_distance=False)[0]
        result_list = []
        for index in result:
            result_list.append(self._service_array[index])
        return result_list

    def number_of_services(self):
        pass
def scale(params):
    xTrain = params[0]
    yTrain = params[1]
    xValid = params[2]
    print('neuralizing')
    global neural
    neural = BernoulliRBM()
    xTrain = neural.fit_transform(xTrain, yTrain)
    xValid = neural.transform(xValid)
    return [xTrain, yTrain, xValid]
def RBM_new():
    '''RBM to evaluate on artificially generated data'''
    # Define data sizes
    (val_length, test_length) = (11710, 105384)
    (train_half_length, test_half_length) = (int(val_length / 2), int(test_length / 2))

    # Generate artificial Z
    print('\n##############VALIDATION DATA#################')
    (TL_val, Z_val) = Z_new_generator(val_length, classifier_stats_val)
    print('\n##############TEST DATA#################')
    (TL_test, Z_test) = Z_new_generator(test_length, classifier_stats_test)

    # Convert '-1's to '0'
    Z_val[Z_val == -1] = 0
    Z_test[Z_test == -1] = 0

    # Train RBM
    rbm = BernoulliRBM(n_components=N_COMPONENTS, n_iter=N_ITERATIONS,
                       learning_rate=LEARNING_RATE, batch_size=BATCH_SIZE)
    print(f'\nStarting RBM training.... {datetime.datetime.now().time()}')
    Z_val_probability = rbm.fit_transform(Z_val)
    Z_test_probability = rbm.transform(Z_test)
    print(f'\nRBM complete.... - {datetime.datetime.now().time()}')

    # Convert probabilities to values
    Z_val_final = np.sign(Z_val_probability - T1)
    Z_test_final = np.sign(Z_test_probability - T1)

    # RBM on validation data
    print('\n\n****VALIDATION RBM RESULTS*****')
    true_positives = sum(Z_val_final[train_half_length:] == -1)
    false_negatives = sum(Z_val_final[train_half_length:] == 1)
    false_positives = sum(Z_val_final[0:train_half_length] == -1)
    true_negatives = sum(Z_val_final[0:train_half_length] == 1)
    MLmodel_evaluation(true_positives, false_positives, false_negatives, true_negatives)

    # RBM on test data
    print('\n\n****TEST RBM RESULTS*****')
    true_positives = sum(Z_test_final[test_half_length:] == -1)
    false_negatives = sum(Z_test_final[test_half_length:] == 1)
    false_positives = sum(Z_test_final[0:test_half_length] == -1)
    true_negatives = sum(Z_test_final[0:test_half_length] == 1)
    MLmodel_evaluation(true_positives, false_positives, false_negatives, true_negatives)
def run():
    train = trainingImage()
    trainTarget = trainingImageTarget()
    rbm = BernoulliRBM(n_components=numComponents, learning_rate=eta,
                       batch_size=numBatch, random_state=0, verbose=True)
    errors = np.empty(numEpochs)
    for epoch in range(numEpochs):
        transformedTrain = rbm.fit_transform(train, trainTarget)
        reconstructedTrain = transformedTrain.dot(rbm.components_)
        errors[epoch] = np.sum(np.abs(reconstructedTrain - train))

    fig, digitAxis = plt.subplots(2, len(classes))
    for index in range(len(classes)):
        imageIndex = classes[index]
        original = np.copy(train[imageIndex].reshape(28, 28))
        reconstructed = np.copy(reconstructedTrain[imageIndex].reshape(28, 28))
        digitAxis[0, index].imshow(original, extent=(0, 28, 0, 28), aspect='auto')
        digitAxis[1, index].imshow(reconstructed, extent=(0, 28, 0, 28), aspect='auto')
    plt.show()
class RBMClassifier:
    def __repr__(self):
        return 'RBMClassifier'

    def __init__(self, random_state=42, class_weight='balanced', verbose=True):
        self.rbm = BernoulliRBM(random_state=random_state, verbose=verbose)
        self.lr = LogisticRegression(class_weight=class_weight)

    def fit(self, X_train, y_train, X_valid, y_valid):
        # transform train and validation rows together, then split them back
        self.data = self.rbm.fit_transform(
            pd.concat([pd.DataFrame(X_train), pd.DataFrame(X_valid)]))
        self.train = self.data[:len(X_train)]
        self.fit_lr = self.lr.fit(self.train, y_train)
        self.predict = self.data[len(X_train):]
        return self

    def predict_proba(self, Z):
        return self.fit_lr.predict_proba(self.predict)
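# A minimal usage sketch for RBMClassifier above (the synthetic arrays are
# assumptions, not from the original source). Note that predict_proba scores the
# validation rows captured during fit, so the Z argument is effectively ignored.
import numpy as np

rng = np.random.RandomState(42)
X_tr, y_tr = rng.rand(80, 20), rng.randint(0, 2, 80)
X_va, y_va = rng.rand(20, 20), rng.randint(0, 2, 20)

clf = RBMClassifier(verbose=False)
clf.fit(X_tr, y_tr, X_va, y_va)
proba = clf.predict_proba(X_va)
print(proba.shape)  # (20, 2)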
def process_machine_learning():
    i_label, i_feature, i_symbol = load_data(params['path'])
    i_pos = i_symbol.rfind('/') + 1
    i_symbol = i_symbol[i_pos:i_pos + 2]

    # scales values in features so that they range from 0 to 1
    i_minmaxScaler = MinMaxScaler()
    i_feature = i_minmaxScaler.fit_transform(i_feature)
    print("Dimensions")
    print("label", i_label.shape)
    print("feature", i_feature.shape)

    # feature selection using RBM
    i_start_time = time.time()
    i_rbm = BernoulliRBM(n_components=params['reduced_feature'],
                         learning_rate=params['learning_rate'],
                         batch_size=params['batchsize'],
                         n_iter=params['n_iter'])
    i_feature = i_rbm.fit_transform(i_feature)
    print("RBM--- %s seconds ---" % (time.time() - i_start_time))
    print("Dimensions after RBM")
    print("label", i_label.shape)
    print("feature", i_feature.shape)

    i_x_train, i_x_test, i_y_train, i_y_test = train_test_split(i_feature, i_label)
    i_y_pred = random_forest(i_x_train, i_x_test, i_y_train)

    i_filename = 'PRED_' + i_symbol + '-5.csv'
    with open(i_filename, 'w', newline='') as csvfile:
        i_writer = csv.writer(csvfile, delimiter=',')
        for i in range(len(i_y_pred)):
            i_writer.writerow((i_y_pred[i], i_y_test[i][0]))

    print_f1_score(i_y_test, i_y_pred)
    classification_error(i_y_test, i_y_pred)
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.neural_network import BernoulliRBM
from sklearn import svm
import pandas
from massage import Massager
import numpy as np

train = pandas.read_csv("train.csv")
target = train["Survived"]
m = Massager()
train_array = m.transform(train, True)

brbm = BernoulliRBM(n_components=3, learning_rate=0.01)
trantrain = brbm.fit_transform(train_array)

param_grid = dict(C=np.logspace(-10, 2, 13), gamma=np.logspace(-9, 3, 13))
grid = GridSearchCV(svm.SVC(), param_grid=param_grid)
grid.fit(trantrain, target)
C = grid.best_params_['C']
gamma = grid.best_params_['gamma']

classifier = svm.SVC(C=C, gamma=gamma)
classifier.fit(trantrain, target)
vscore = cross_val_score(classifier, train_array, target)
print("Validation score: {0} sd: {1}".format(vscore.mean(), vscore.std()))

test = pandas.read_csv("test.csv")
answers = pandas.DataFrame(test["PassengerId"])
test_array = m.transform(test)
trantest = brbm.transform(test_array)
del rows

folds = StratifiedKFold(n_splits=3)
result = []
for train, test in folds.split(sentences, information):
    data_train = sentences[train]
    result_train = information[train]
    data_test = sentences[test]
    result_test = information[test]

    vectorizer = TfidfVectorizer(binary=True, norm=None, use_idf=False)
    rbm = BernoulliRBM()
    classifier = RandomForestClassifier()

    data_train = vectorizer.fit_transform(data_train)
    data_test = vectorizer.transform(data_test)
    data_train = rbm.fit_transform(data_train)
    data_test = rbm.transform(data_test)

    classifier.fit(data_train, result_train)
    print(classificationError(classifier.predict(data_test), result_test))
    result.append(classifier.score(data_test, result_test))

print(sum(result) / float(len(result)))
minmaxScaler = MinMaxScaler()
feature = minmaxScaler.fit_transform(feature)
print("Dimensions")
print("label", label.shape)
print("feature", feature.shape)

# feature selection using RBM
start_time = time.time()
rbm = BernoulliRBM(n_components=params['reduced_feature'],
                   learning_rate=params['learning_rate'],
                   batch_size=params['batchsize'],
                   n_iter=params['n_iter'])
feature = rbm.fit_transform(feature)
print("RBM--- %s seconds ---" % (time.time() - start_time))
print("Dimensions after RBM")
print("label", label.shape)
print("feature", feature.shape)

combined = np.concatenate((label, feature), axis=1)

# resulting dataset after RBM is exported in binary format;
# dimensions (n_rows, n_columns) are added to the beginning of the binary file
dimension = combined.shape
print("Dimension of combined")
print(dimension)
combined = np.append(dimension, combined)
CMAs = ['Rachelle', 'Karen']
trainSource = CMAs[0]
testSource = CMAs[1]

def getXYforMultiSet(source):
    ds, featuresNames = labanUtil.getPybrainDataSet(source)
    X, Y = labanUtil.fromDStoXY(ds)
    return X, np.transpose(Y)

X, Y = getXYforMultiSet(trainSource)
print(X)
X_test, Y_test = getXYforMultiSet(testSource)
res = []
params = np.linspace(0.001, 0.1, 10)
for p in params:
    print(p)
    rbm = BernoulliRBM(n_components=int(p * X.shape[1]), n_iter=1000)
    print(rbm.fit_transform(X))
    """
    X_small = rbm.fit_transform(X)
    print X_small.shape
    clf = linear_model.MultiTaskElasticNetCV()
    #clf = Pipeline(steps=[('rbm', rbm), ('MultiTaskElasticNetCV', multiClf)])
    clf.fit(X_small, Y)
    print np.array(clf.predict(rbm.transform(X_test)))
    predTrain = np.array(clf.predict(X_small))
    splits = []
    for col in range(predTrain.shape[1]):
        splits.append(getSplitThreshold(predTrain[:, col], Y[:, col]))
    pred = np.array(clf.predict(rbm.transform(X_test)))
    for col in range(pred.shape[1]):
# model number 2
# bigMatrixTrain = (bigMatrixTrain - np.min(bigMatrixTrain, 0)) / (np.max(bigMatrixTrain, 0) + 0.0001)  # 0-1 scaling

# Divide dataset for cross-validation purposes
X_train, X_test, y_train, y_test = cross_validation.train_test_split(
    bigMatrixTrain, y, test_size=0.4, random_state=0)  # fix this

# Models we will use
rbm = BernoulliRBM(random_state=0, verbose=True)
# classifier = Pipeline(steps=[('rbm', rbm), ('logistic', logistic)])
rbm.learning_rate = 0.04
rbm.n_iter = 30
# More components tend to give better prediction performance, but longer fitting time
rbm.n_components = 300
X_train = rbm.fit_transform(X_train)
X_test = rbm.transform(X_test)

# Train a logistic regression model with a grid search over C
print("Fitting the classifier to the training set")
logisticModel = linear_model.LogisticRegression()
t0 = time()
param_grid = {'C': [10, 30, 100, 300, 1000]}
logisticModel = GridSearchCV(logisticModel, param_grid=param_grid)
logisticModel = logisticModel.fit(X_train, y_train)
print("done in %0.3fs" % (time() - t0))
print("Best estimator found by grid search:")
print(logisticModel.best_estimator_)
# logistic.C = 6000.0
print(BC.data.shape)
print(BC.target.shape)
X = BC.data
Y = BC.target
Xdata = pd.DataFrame(X)
print(Xdata.describe())

# 0-1 scaling
X = (X - np.min(X, 0)) / (np.max(X, 0) - np.min(X, 0))
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=1)

RbmModel = BernoulliRBM(random_state=0, verbose=True)
FitRbmModel = RbmModel.fit_transform(X_train, Y_train)
LogModel = linear_model.LogisticRegression()
Classifier = Pipeline(steps=[('RbmModel', RbmModel), ('LogModel', LogModel)])
LogModel.fit(X_train, Y_train)
Classifier.fit(X_train, Y_train)

print("The RBM model:")
print("Predict: ", Classifier.predict(X_test))
print("Real: ", Y_test)
CM = ConfusionMatrix(Y_test, Classifier.predict(X_test))
CM.print_stats()
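# A minimal sketch of the more conventional RBM + logistic-regression pipeline
# on the same kind of data (the hyperparameter values here are illustrative
# assumptions): the Pipeline fits the RBM and the classifier in a single call,
# so no manual fit_transform or coefficient assignment is needed.
from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.neural_network import BernoulliRBM
from sklearn.pipeline import Pipeline

data = load_breast_cancer()
X, y = data.data, data.target
X = (X - X.min(0)) / (X.max(0) - X.min(0))  # RBM expects values in [0, 1]
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, random_state=1)

pipe = Pipeline(steps=[('rbm', BernoulliRBM(n_components=64, learning_rate=0.05,
                                            n_iter=20, random_state=0)),
                       ('logistic', LogisticRegression(max_iter=1000))])
pipe.fit(X_tr, y_tr)
print("test accuracy:", pipe.score(X_te, y_te))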
symbol = symbol[pos:pos + 2]

# scales values in features so that they range from 0 to 1
minmaxScaler = MinMaxScaler()
feature = minmaxScaler.fit_transform(feature)
print("Dimensions")
print("label", label.shape)
print("feature", feature.shape)

# feature selection using RBM
start_time = time.time()
rbm = BernoulliRBM(n_components=params['reduced_feature'],
                   learning_rate=params['learning_rate'],
                   batch_size=params['batchsize'],
                   n_iter=params['n_iter'])
feature = rbm.fit_transform(feature)
print("RBM--- %s seconds ---" % (time.time() - start_time))
print("Dimensions after RBM")
print("label", label.shape)
print("feature", feature.shape)

x_train, x_test, y_train, y_test = train_test_split(feature, label)
y_pred = random_forest(x_train, x_test, y_train)

filename = 'PRED_' + symbol + '-5.csv'
with open(filename, 'w', newline='') as csvfile:
    writer = csv.writer(csvfile, delimiter=',')
    for i in range(len(y_pred)):
        writer.writerow((y_pred[i], y_test[i][0]))
X = (X - np.min(X, 0)) / (np.max(X, 0) + 0.0001)  # 0-1 scaling

n_sample, n_feature = X.shape
n_split = int(n_sample * 0.8)
order = np.random.permutation(n_sample)
X_train = X[order][0:n_split, :]
Y_train = Y[order][0:n_split]
X_test = X[order][n_split:, :]
Y_test = Y[order][n_split:]

rbm = BernoulliRBM(random_state=0, verbose=True)
rbm.learning_rate = 0.06
rbm.n_iter = 20
rbm.n_components = 100
rbm.fit(X_train, Y_train)
X_new = rbm.fit_transform(X)
print(X_new.shape)

plt.figure(figsize=(8, 8))
for i in range(100):
    plt.subplot(10, 10, i + 1)
    comp = X_new[i:i + 1, :]
    plt.imshow(comp.reshape((10, 10)), cmap=plt.cm.gray_r)
    plt.xticks(())
    plt.yticks(())
plt.suptitle('100 components extracted by RBM', fontsize=16)
plt.subplots_adjust(0.08, 0.02, 0.92, 0.85, 0.08, 0.23)
plt.show()
mylist = []
for u in range(len_test_range):
    clear_output(wait=True)
    print('Filling dict')
    print(str(u) + ' of ' + str(len_test_range))
    mylist += [pd.Series(getPerc(test_range[u:u + len_dataset_pattern]))]
mylist = np.array(mylist)

# In[ ]:

from sklearn.neural_network import BernoulliRBM

kmeans = BernoulliRBM()
clusters = pd.Series(kmeans.fit_transform(mylist), name='clusters')

# In[ ]:

c = pd.DataFrame(mylist).join(clusters)
c
d = c.loc[(c.clusters == c.clusters.iloc[-1])]
d
d = d.iloc[:, :-1]
for x in range(len(d)):
    a = 2
    d.iloc[[x, -1]].transpose().plot(linewidth=1)
    d.iloc[[x, -1], :]  # .iloc[:,[x,-1]]

# In[ ]: