```python
# Setting the object for activation function.
activation_list = [
    LogisticFunction(binary_flag=False, normalization_mode="min_max", normalize_flag=True),
    LogisticFunction(binary_flag=False, normalization_mode="min_max", normalize_flag=True),
    LogisticFunction(binary_flag=False, normalization_mode="min_max", normalize_flag=True)
]

# Setting the object for function approximation.
approximation_list = [ContrastiveDivergence(), ContrastiveDivergence()]

dbm = StackedAutoEncoder(
    DBMMultiLayerBuilder(),
    [observed_arr.shape[1], 10, observed_arr.shape[1]],
    activation_list,
    approximation_list,
    1e-05,  # Setting learning rate.
    0.5     # Setting dropout rate.
)

# Execute learning.
dbm.learn(
    observed_arr,
    1,                # If the approximation is Contrastive Divergence, this parameter is `k` in the CD method.
    batch_size=100,   # Batch size in mini-batch training.
    r_batch_size=-1,  # If `r_batch_size` > 0, `dbm.learn` performs a kind of recursive learning.
    sgd_flag=True
)

# Extract the reconstruction error.
reconstruct_error_arr = dbm.get_reconstruct_error_arr(layer_number=0)
```

Sample output:

```
0.656428 0.947666
0.409032 0.959559
0.397501 0.353150
0.614216 0.167008
0.424654 0.204616
0.573720 0.147871
0.722278 0.068951
.....

Reconstruct error: [ 0.08297197  0.07091231  0.0823424  ...,  0.0721624   0.08404181  0.06981017]
```
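As a purely illustrative follow-up (not part of the library's demo), the per-sample reconstruction error extracted above can be summarized with plain NumPy:

```python
import numpy as np

# `reconstruct_error_arr` holds one reconstruction error per observed data point.
# A lower mean error indicates a better reconstruction by the stacked auto-encoder.
print("mean error:   ", np.nanmean(reconstruct_error_arr))
print("median error: ", np.nanmedian(reconstruct_error_arr))
print("worst samples:", np.argsort(reconstruct_error_arr)[::-1][:5])
```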
```python
target_arr = np.random.uniform(size=(10000, 10000))

dbm = StackedAutoEncoder(
    DBMMultiLayerBuilder(),
    [target_arr.shape[1], 10, target_arr.shape[1]],
    [SoftmaxFunction(), SoftmaxFunction(), SoftmaxFunction()],
    [ContrastiveDivergence(), ContrastiveDivergence()],
    0.05,
    0.5,
    inferencing_flag=True,
    inferencing_plan="each"
)

dbm.learn(target_arr, traning_count=1, batch_size=100, r_batch_size=-1)

import pandas as pd

feature_points_df = pd.DataFrame(dbm.feature_points_arr)
print(feature_points_df.shape)
print(feature_points_df.head())
print("-" * 100)
print(feature_points_df.tail())
print("-" * 100)
```
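The extracted `feature_points_arr` can be consumed like any ordinary feature matrix. As an illustrative sketch (the cosine-similarity helper below is not part of this library), pairwise similarities between samples can be computed directly with NumPy:

```python
import numpy as np

def cosine_similarity_matrix(feature_arr):
    # Normalize each row to unit length, then take pairwise dot products.
    norm_arr = feature_arr / (np.linalg.norm(feature_arr, axis=1, keepdims=True) + 1e-08)
    return np.dot(norm_arr, norm_arr.T)

# Pairwise similarity between the feature points of the first 5 samples.
sim_arr = cosine_similarity_matrix(dbm.feature_points_arr[:5])
print(sim_arr.round(3))
```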
```python
from pprint import pprint
from sklearn.datasets import make_classification
# `sklearn.cross_validation` was removed in scikit-learn 0.20; use `model_selection` instead.
from sklearn.model_selection import train_test_split

if __name__ == "__main__":
    data_tuple = make_classification(
        n_samples=20000,
        n_features=1000,
        n_informative=5,
        n_classes=5,
        class_sep=1.0,
        scale=0.1
    )
    data_tuple_x, data_tuple_y = data_tuple
    traning_x, test_x, traning_y, test_y = train_test_split(
        data_tuple_x,
        data_tuple_y,
        test_size=0.5,
        random_state=888
    )

    dbm = StackedAutoEncoder(
        DBMMultiLayerBuilder(),
        [traning_x.shape[1], 10, traning_x.shape[1]],
        LogisticFunction(),
        ContrastiveDivergence(),
        0.05
    )
    dbm.learn(traning_x, traning_count=1)

    import pandas as pd
    feature_points_df = pd.DataFrame(dbm.feature_points_arr)
    print(feature_points_df.shape)
    print(feature_points_df.head())
    print("-" * 100)
    print(feature_points_df.tail())
```
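Since `dbm.feature_points_arr` in this demo holds one row of feature points per row of `traning_x`, it can be handed to any downstream scikit-learn classifier. The following sketch is only an illustrative addition under that assumption, not part of the original demo:

```python
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Train a simple classifier on the DBM feature points extracted from `traning_x`.
feature_arr = dbm.feature_points_arr
clf = LogisticRegression(max_iter=1000)
clf.fit(feature_arr, traning_y)

# Evaluate on the training features themselves (the demo only extracts
# feature points for `traning_x`, so this is an in-sample check).
pred_arr = clf.predict(feature_arr)
print("in-sample accuracy:", accuracy_score(traning_y, pred_arr))
```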
```python
            # (Continues the nested loops above that enumerate neighborhood cells of the maze.)
            __y = y + _y
            if __x < 0 or __y < 0:
                vector = 0
            else:
                try:
                    vector = map_arr[__y][__x]
                except IndexError:
                    vector = 0
            vector_list.append(vector)
        vector_list_list.append(vector_list)

vector_arr = np.array(vector_list_list)
vector_arr = vector_arr.astype(float)

dbm = StackedAutoEncoder(
    DBMMultiLayerBuilder(),
    [vector_arr.shape[1], vector_arr.shape[1], 10],
    LogisticFunction(),
    ContrastiveDivergence(),
    0.005
)
dbm.learn(vector_arr, traning_count=1)

feature_arr = dbm.feature_points_arr
feature_arr = feature_arr[:, 0]
feature_map_arr = feature_arr.reshape(map_d, map_d)

map_arr = map_arr.astype(object)
map_arr[:, 0] = wall_label
map_arr[0, :] = wall_label
map_arr[:, -1] = wall_label
map_arr[-1, :] = wall_label
map_arr[1][1] = start_point_label
map_arr[map_d - 2][map_d - 2] = end_point_label
```
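To inspect the result, `feature_map_arr` can be rendered as a heat map over the maze grid. The plotting snippet below is an illustrative addition that assumes `matplotlib` is installed:

```python
import matplotlib.pyplot as plt

# Render the first feature point of every cell as a heat map over the maze grid.
plt.imshow(feature_map_arr, cmap="viridis")
plt.colorbar(label="feature point (1st hidden unit)")
plt.title("DBM feature map of the maze")
plt.savefig("feature_map.png")
```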
```python
class DBMLikeSkipGramVectorizer(VectorizableToken):
    '''
    Vectorize tokens by a Deep Boltzmann Machine (DBM).

    Note that this class employs an original method based on this library-specific
    intuition and analogy about skip-gram, whereby n-grams are still stored to model
    language, but tokens are allowed to be skipped.
    '''

    def __init__(self, token_list, document_list=[], traning_count=100, batch_size=20, learning_rate=1e-05, feature_dim=100):
        '''
        Initialize.

        Args:
            token_list:      The list of all tokens in all sentences.
                             If the input value is a two-dimensional list, the first-dimensional key
                             represents a sentence number, and the second-dimensional key represents
                             a token number.
            document_list:   The list of documents composed of tokens.
            traning_count:   The number of epochs.
            batch_size:      Batch size.
            learning_rate:   Learning rate.
            feature_dim:     The dimension of feature points.
        '''
        pair_dict = {}
        document_dict = {}
        self.__token_arr = np.array(token_list)

        if self.__token_arr.ndim == 2:
            # Count co-occurrences of each token with its left and right neighbors, sentence by sentence.
            for i in range(self.__token_arr.shape[0]):
                for j in range(1, self.__token_arr[i].shape[0] - 1):
                    pair_dict.setdefault(
                        (self.__token_arr[i, j], self.__token_arr[i, j - 1]),
                        0
                    )
                    pair_dict[(self.__token_arr[i, j], self.__token_arr[i, j - 1])] += 1
                    pair_dict.setdefault(
                        (self.__token_arr[i, j], self.__token_arr[i, j + 1]),
                        0
                    )
                    pair_dict[(self.__token_arr[i, j], self.__token_arr[i, j + 1])] += 1

                    # Record which documents contain the token.
                    document_dict.setdefault(self.__token_arr[i, j], [])
                    for d in range(len(document_list)):
                        if self.__token_arr[i, j] in document_list[d]:
                            document_dict[self.__token_arr[i, j]].append(d)

        elif self.__token_arr.ndim == 1:
            for i in range(1, self.__token_arr.shape[0] - 1):
                pair_dict.setdefault(
                    (self.__token_arr[i], self.__token_arr[i - 1]),
                    0
                )
                pair_dict[(self.__token_arr[i], self.__token_arr[i - 1])] += 1
                pair_dict.setdefault(
                    (self.__token_arr[i], self.__token_arr[i + 1]),
                    0
                )
                pair_dict[(self.__token_arr[i], self.__token_arr[i + 1])] += 1

                document_dict.setdefault(self.__token_arr[i], [])
                for d in range(len(document_list)):
                    if self.__token_arr[i] in document_list[d]:
                        document_dict[self.__token_arr[i]].append(d)
        else:
            raise ValueError("`token_list` must be a one- or two-dimensional list of tokens.")

        token_list = list(set(self.__token_arr.ravel().tolist()))
        token_arr = np.zeros((len(token_list), len(token_list)))
        pair_arr = np.zeros((len(token_list), len(token_list)))
        document_arr = np.zeros((len(token_list), len(document_list)))

        for i in range(token_arr.shape[0]):
            for j in range(token_arr.shape[0]):
                try:
                    pair_arr[i, j] = pair_dict[(token_list[i], token_list[j])]
                    token_arr[i, j] = 1.0
                except KeyError:
                    pass

            if len(document_list) > 0:
                if token_list[i] in document_dict:
                    for d in document_dict[token_list[i]]:
                        document_arr[i, d] = 1.0

        # Softmax-like rescaling and standardization of the co-occurrence matrix.
        pair_arr = np.exp(pair_arr - pair_arr.max())
        pair_arr = pair_arr / pair_arr.sum()
        pair_arr = (pair_arr - pair_arr.mean()) / (pair_arr.std() + 1e-08)

        if len(document_list) > 0:
            document_arr = (document_arr - document_arr.mean()) / (document_arr.std() + 1e-08)

        # Concatenate co-occurrence features and document-membership features.
        token_arr = np.c_[pair_arr, document_arr]
        token_arr = (token_arr - token_arr.mean()) / (token_arr.std() + 1e-08)

        self.__dbm = StackedAutoEncoder(
            DBMMultiLayerBuilder(),
            [token_arr.shape[1], feature_dim, token_arr.shape[1]],
            [TanhFunction(), TanhFunction(), TanhFunction()],
            [ContrastiveDivergence(), ContrastiveDivergence()],
            learning_rate=learning_rate
        )
        self.__dbm.learn(token_arr, traning_count=traning_count, batch_size=batch_size, sgd_flag=True)

        feature_points_arr = self.__dbm.feature_points_arr

        self.__token_arr = token_arr
        self.__token_list = token_list
        self.__feature_points_arr = feature_points_arr

    def vectorize(self, token_list):
        '''
        Vectorize the token list.

        Args:
            token_list:     The list of tokens.

        Returns:
            [vector of token, vector of token, vector of token, ...]
        '''
        return [
            self.__extract_from_feature(token).tolist() for token in token_list
        ]

    def convert_tokens_into_matrix(self, token_list):
        '''
        Create a matrix of sentences.

        Args:
            token_list:     The list of tokens.

        Returns:
            2-D `np.ndarray` of sentences.
            Each row means the one-hot vectors of one sentence.
        '''
        return np.array(self.vectorize(token_list)).astype(np.float32)

    def tokenize(self, vector_list):
        '''
        Tokenize a vector.

        Args:
            vector_list:    The list of vectors of one token.

        Returns:
            token
        '''
        vector_arr = np.array(vector_list)
        if vector_arr.ndim == 2 and vector_arr.shape[0] > 1:
            # Average multiple vectors into one query vector.
            vector_arr = np.nanmean(vector_arr, axis=0)
        vector_arr = vector_arr.reshape(1, -1)

        # Nearest-neighbor search in feature space by squared Euclidean distance.
        diff_arr = np.nansum(np.square(vector_arr - self.__feature_points_arr), axis=1)
        return np.array([self.__token_list[diff_arr.argmin(axis=0)]])

    def __extract_from_feature(self, token):
        try:
            key = self.__token_list.index(token)
            arr = self.__feature_points_arr[key]
            arr = arr.astype(np.float32)
        except ValueError:
            # Unknown token: fall back to the mean of all feature points.
            arr = self.__feature_points_arr.mean(axis=0)
        return arr

    def get_token_arr(self):
        ''' getter '''
        return self.__token_arr

    def set_token_arr(self, value):
        ''' setter '''
        raise TypeError("This property must be read-only.")

    token_arr = property(get_token_arr, set_token_arr)

    def set_readonly(self, value):
        ''' setter '''
        raise TypeError("This property must be read-only.")

    def get_token_list(self):
        ''' getter '''
        return self.__token_list

    token_list = property(get_token_list, set_readonly)
```
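A minimal usage sketch of `DBMLikeSkipGramVectorizer` with a toy corpus follows. The sentences, document, and hyperparameter values are made up for illustration, and the class and its dependencies are assumed to be imported as above:

```python
# Toy corpus: each inner list is one tokenized sentence.
token_list = [
    ["the", "cat", "sat", "on", "the", "mat"],
    ["the", "dog", "sat", "on", "the", "rug"]
]
document_list = [["the", "cat", "sat", "on", "the", "mat"]]

vectorizer = DBMLikeSkipGramVectorizer(
    token_list=token_list,
    document_list=document_list,
    traning_count=5,   # Keep the epochs small for this toy example.
    batch_size=2,
    feature_dim=10
)

# Vectorize tokens and map a vector back to its nearest token.
vector_list = vectorizer.vectorize(["cat", "dog"])
print(len(vector_list), len(vector_list[0]))   # 2 tokens, each a `feature_dim`-dimensional vector.
print(vectorizer.tokenize([vector_list[0]]))   # Should recover the token nearest to "cat" in feature space.
```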