    LogisticFunction(binary_flag=False, normalization_mode="min_max", normalize_flag=True),
    LogisticFunction(binary_flag=False, normalization_mode="min_max", normalize_flag=True),
    LogisticFunction(binary_flag=False, normalization_mode="min_max", normalize_flag=True)
]

# Setting the object for function approximation.
approximation_list = [ContrastiveDivergence(), ContrastiveDivergence()]

dbm = StackedAutoEncoder(
    DBMMultiLayerBuilder(),
    [observed_arr.shape[1], 10, observed_arr.shape[1]],
    activation_list,
    approximation_list,
    1e-05,  # Setting the learning rate.
    0.5     # Setting the dropout rate.
)

# Execute learning.
dbm.learn(
    observed_arr,
    1,                # If the approximation is Contrastive Divergence, this parameter is `k` in the CD method.
    batch_size=100,   # Batch size in mini-batch training.
    r_batch_size=-1,  # If `r_batch_size` > 0, `dbm.learn` performs a kind of recursive learning.
    sgd_flag=True
)
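The visible-layer LogisticFunction above is configured with normalization_mode="min_max", so observed_arr is usually easiest to train on when each column has already been scaled into the [0, 1] interval before dbm.learn is executed. A minimal NumPy sketch of that preprocessing step, assuming observed_arr is a plain real-valued array; the helper below is illustrative and not part of pydbm's API:

import numpy as np

def min_max_scale(arr, epsilon=1e-08):
    # Column-wise min-max scaling into [0, 1]; `epsilon` guards against
    # zero-variance columns. Illustrative helper, not a pydbm function.
    min_arr = arr.min(axis=0)
    max_arr = arr.max(axis=0)
    return (arr - min_arr) / (max_arr - min_arr + epsilon)

observed_arr = min_max_scale(observed_arr.astype(np.float64))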
Feature points:
0.190599 0.183594 0.482996 0.911710 0.939766 0.202852 0.042163 0.470003 0.104970 0.602966
0.927917 0.134440 0.600353 0.264248 0.419805 0.158642 0.328253 0.163071 0.017190 0.982587
0.779166 0.656428 0.947666 0.409032 0.959559 0.397501 0.353150 0.614216 0.167008 0.424654
0.204616 0.573720 0.147871 0.722278 0.068951
.....

Reconstruct error:
[ 0.08297197 0.07091231 0.0823424 ..., 0.0721624 0.08404181 0.06981017]
'''

target_arr = np.random.uniform(size=(10000, 10000))

dbm = StackedAutoEncoder(
    DBMMultiLayerBuilder(),
    [target_arr.shape[1], 10, target_arr.shape[1]],
    [SoftmaxFunction(), SoftmaxFunction(), SoftmaxFunction()],
    [ContrastiveDivergence(), ContrastiveDivergence()],
    0.05,
    0.5,
    inferencing_flag=True,
    inferencing_plan="each"
)
dbm.learn(target_arr, traning_count=1, batch_size=100, r_batch_size=-1)

import pandas as pd

feature_points_df = pd.DataFrame(dbm.feature_points_arr)
print(feature_points_df.shape)
print(feature_points_df.head())
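Because the middle layer of the stacked auto-encoder above has 10 units, feature_points_df has 10 columns. Their pairwise correlations give a quick impression of how redundant the learned feature dimensions are. This is a purely pandas-based check on the DataFrame built above, not a pydbm feature:

# Pairwise Pearson correlations between the 10 learned feature dimensions.
# Off-diagonal values close to +/-1.0 suggest redundant hidden units.
correlation_df = feature_points_df.corr()
print(correlation_df.round(2))

# Summary statistics per feature dimension.
print(feature_points_df.describe())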
from pydbm.dbm.deep_boltzmann_machine import DeepBoltzmannMachine
from pydbm.dbm.builders.dbm_multi_layer_builder import DBMMultiLayerBuilder
from pydbm.activation.logistic_function import LogisticFunction
from pydbm.approximation.contrastive_divergence import ContrastiveDivergence
import numpy as np
import random
import pandas as pd
from pprint import pprint
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

if __name__ == "__main__":
    '''
    '''
    # Generate a synthetic multi-class data set.
    data_tuple = make_classification(
        n_samples=20000,
        n_features=1000,
        n_informative=5,
        n_classes=5,
        class_sep=1.0,
        scale=0.1
    )
    data_tuple_x, data_tuple_y = data_tuple
    traning_x, test_x, traning_y, test_y = train_test_split(
        data_tuple_x,
        data_tuple_y,
        test_size=0.5,
        random_state=888
    )

    dbm = DeepBoltzmannMachine(
        DBMMultiLayerBuilder(),
        [traning_x.shape[1], 10, traning_x.shape[1]],
        LogisticFunction(),
        ContrastiveDivergence(),
        0.05
    )
    dbm.learn(traning_x, traning_count=1)
    print(dbm.get_feature_point_list(0))
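The get_feature_point_list(0) call above returns the feature points as a plain Python list; for further processing it is convenient to convert it into a NumPy array. The argument 0 is assumed here to select the layer whose feature points are requested, exactly as in the call already shown:

import numpy as np

# Assumption: the argument of `get_feature_point_list` selects the layer,
# as in the call above; the result is converted for array-based processing.
feature_point_arr = np.array(dbm.get_feature_point_list(0))
print(feature_point_arr.shape)
print(feature_point_arr[:5])  # first five feature points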
from pydbm.dbm.deepboltzmannmachine.stacked_auto_encoder import StackedAutoEncoder
from pydbm.dbm.builders.dbm_multi_layer_builder import DBMMultiLayerBuilder
from pydbm.activation.logistic_function import LogisticFunction
from pydbm.approximation.contrastive_divergence import ContrastiveDivergence
from pprint import pprint
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

if __name__ == "__main__":
    '''
    '''
    # Generate a synthetic multi-class data set.
    data_tuple = make_classification(
        n_samples=20000,
        n_features=1000,
        n_informative=5,
        n_classes=5,
        class_sep=1.0,
        scale=0.1
    )
    data_tuple_x, data_tuple_y = data_tuple
    traning_x, test_x, traning_y, test_y = train_test_split(
        data_tuple_x,
        data_tuple_y,
        test_size=0.5,
        random_state=888
    )

    dbm = StackedAutoEncoder(
        DBMMultiLayerBuilder(),
        [traning_x.shape[1], 10, traning_x.shape[1]],
        LogisticFunction(),
        ContrastiveDivergence(),
        0.05
    )
    dbm.learn(traning_x, traning_count=1)

    import pandas as pd

    feature_points_df = pd.DataFrame(dbm.feature_points_arr)
    print(feature_points_df.shape)
    print(feature_points_df.head())
    print("-" * 100)
    print(feature_points_df.tail())
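Since make_classification also provides labels, the compressed representation in dbm.feature_points_arr can be evaluated as input features for an ordinary classifier. The sketch below assumes that feature_points_arr contains one row per row of traning_x, in the same order; it uses scikit-learn's LogisticRegression, which is unrelated to pydbm:

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Assumption: one row of feature points per training sample, in the same order.
compressed_x = dbm.feature_points_arr

clf = LogisticRegression()
clf.fit(compressed_x, traning_y)
pred_y = clf.predict(compressed_x)

# Training accuracy on the compressed 10-dimensional representation.
print(accuracy_score(traning_y, pred_y))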
def __init__(
    self,
    token_list,
    document_list=[],
    traning_count=100,
    batch_size=20,
    learning_rate=1e-05,
    feature_dim=100
):
    '''
    Initialize.

    Args:
        token_list:     The list of all tokens in all sentences.
                        If the input value is a two-dimensional list, the first-dimensional key
                        represents a sentence number and the second-dimensional key represents
                        a token number.
        document_list:  The list of documents, each composed of tokens.
        traning_count:  The number of training epochs.
        batch_size:     Batch size.
        learning_rate:  Learning rate.
        feature_dim:    The dimension of the feature points.
    '''
    pair_dict = {}
    document_dict = {}
    self.__token_arr = np.array(token_list)
    if self.__token_arr.ndim == 2:
        # Count co-occurrences of each token with its left and right neighbours
        # within every sentence, and record which documents contain each token.
        for i in range(self.__token_arr.shape[0]):
            for j in range(1, self.__token_arr[i].shape[0] - 1):
                pair_dict.setdefault((self.__token_arr[i, j], self.__token_arr[i, j - 1]), 0)
                pair_dict[(self.__token_arr[i, j], self.__token_arr[i, j - 1])] += 1
                pair_dict.setdefault((self.__token_arr[i, j], self.__token_arr[i, j + 1]), 0)
                pair_dict[(self.__token_arr[i, j], self.__token_arr[i, j + 1])] += 1
                document_dict.setdefault(self.__token_arr[i, j], [])
                for d in range(len(document_list)):
                    if self.__token_arr[i, j] in document_list[d]:
                        document_dict[self.__token_arr[i, j]].append(d)
    elif self.__token_arr.ndim == 1:
        # Same counting for a flat token sequence.
        for i in range(1, self.__token_arr.shape[0] - 1):
            pair_dict.setdefault((self.__token_arr[i], self.__token_arr[i - 1]), 0)
            pair_dict[(self.__token_arr[i], self.__token_arr[i - 1])] += 1
            pair_dict.setdefault((self.__token_arr[i], self.__token_arr[i + 1]), 0)
            pair_dict[(self.__token_arr[i], self.__token_arr[i + 1])] += 1
            document_dict.setdefault(self.__token_arr[i], [])
            for d in range(len(document_list)):
                if self.__token_arr[i] in document_list[d]:
                    document_dict[self.__token_arr[i]].append(d)
    else:
        raise ValueError("`token_list` must be a one- or two-dimensional list of tokens.")

    token_list = list(set(self.__token_arr.ravel().tolist()))
    token_arr = np.zeros((len(token_list), len(token_list)))
    pair_arr = np.zeros((len(token_list), len(token_list)))
    document_arr = np.zeros((len(token_list), len(document_list)))
    for i in range(token_arr.shape[0]):
        for j in range(token_arr.shape[0]):
            try:
                pair_arr[i, j] = pair_dict[(token_list[i], token_list[j])]
                token_arr[i, j] = 1.0
            except KeyError:
                pass

        if len(document_list) > 0:
            if token_list[i] in document_dict:
                for d in document_dict[token_list[i]]:
                    document_arr[i, d] = 1.0

    # Softmax-like rescaling and standardization of the co-occurrence counts.
    pair_arr = np.exp(pair_arr - pair_arr.max())
    pair_arr = pair_arr / pair_arr.sum()
    pair_arr = (pair_arr - pair_arr.mean()) / (pair_arr.std() + 1e-08)

    if len(document_list) > 0:
        document_arr = (document_arr - document_arr.mean()) / (document_arr.std() + 1e-08)
        token_arr = np.c_[pair_arr, document_arr]

    token_arr = (token_arr - token_arr.mean()) / (token_arr.std() + 1e-08)

    # Learn a compressed representation of each token with a stacked auto-encoder.
    self.__dbm = StackedAutoEncoder(
        DBMMultiLayerBuilder(),
        [token_arr.shape[1], feature_dim, token_arr.shape[1]],
        [TanhFunction(), TanhFunction(), TanhFunction()],
        [ContrastiveDivergence(), ContrastiveDivergence()],
        learning_rate=learning_rate
    )
    self.__dbm.learn(
        token_arr,
        traning_count=traning_count,
        batch_size=batch_size,
        sgd_flag=True
    )
    feature_points_arr = self.__dbm.feature_points_arr

    self.__token_arr = token_arr
    self.__token_list = token_list
    self.__feature_points_arr = feature_points_arr
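Since the feature-point matrix learned above holds one embedding row per entry of the token list, a simple way to use the result is a nearest-neighbour lookup by cosine similarity. The helper below is an illustrative sketch that takes the token list and the feature-point matrix as plain arguments; it does not assume any additional method on the class above, and the variable names in the usage comment are hypothetical:

import numpy as np

def most_similar_tokens(query_token, token_list, feature_points_arr, top_n=5):
    # Cosine similarity between the query token's embedding and all others.
    index = token_list.index(query_token)
    norm_arr = feature_points_arr / (
        np.linalg.norm(feature_points_arr, axis=1, keepdims=True) + 1e-08
    )
    similarity_arr = norm_arr @ norm_arr[index]
    # Exclude the query token itself and return the `top_n` closest tokens.
    ranked_key_arr = similarity_arr.argsort()[::-1]
    result_list = [
        (token_list[key], float(similarity_arr[key]))
        for key in ranked_key_arr if key != index
    ]
    return result_list[:top_n]

# Hypothetical usage, given a token list and its learned feature-point matrix:
# print(most_similar_tokens("dbm", token_list, feature_points_arr))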