Code example #1
# Setting objects for activation function.
activation_list = [
    LogisticFunction(binary_flag=False,
                     normalization_mode="min_max",
                     normalize_flag=True),
    LogisticFunction(binary_flag=False,
                     normalization_mode="min_max",
                     normalize_flag=True),
    LogisticFunction(binary_flag=False,
                     normalization_mode="min_max",
                     normalize_flag=True)
]

# Setting the object for function approximation.
approximation_list = [ContrastiveDivergence(), ContrastiveDivergence()]

dbm = StackedAutoEncoder(
    DBMMultiLayerBuilder(),
    [observed_arr.shape[1], 10, observed_arr.shape[1]],
    activation_list,
    approximation_list,
    1e-05,  # Setting learning rate.
    0.5  # Setting dropout rate.
)

# Execute learning.
dbm.learn(
    observed_arr,
    1,  # If the approximation is Contrastive Divergence, this parameter is `k` in the CD method.
    batch_size=100,  # Batch size in mini-batch training.
    r_batch_size=-1,  # If `r_batch_size` > 0, `dbm.learn` performs a kind of recursive learning.
    sgd_flag=True)

# Extract reconstruction error.
reconstruct_error_arr = dbm.get_reconstruct_error_arr(layer_number=0)
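
# A minimal follow-up sketch, assuming `reconstruct_error_arr` is a NumPy array
# as the getter's name suggests: summarize the per-unit error with plain array methods.
print("Mean reconstruct error: {0}".format(reconstruct_error_arr.mean()))
print("Max reconstruct error: {0}".format(reconstruct_error_arr.max()))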
Code example #2
            0.656428  0.947666  0.409032  0.959559  0.397501  0.353150  0.614216
            0.167008  0.424654  0.204616  0.573720  0.147871  0.722278  0.068951
            .....

        Reconstruct error:
            [ 0.08297197  0.07091231  0.0823424  ...,  0.0721624   0.08404181  0.06981017]
    '''

    target_arr = np.random.uniform(size=(10000, 10000))

    dbm = StackedAutoEncoder(
        DBMMultiLayerBuilder(), [target_arr.shape[1], 10, target_arr.shape[1]],
        [SoftmaxFunction(),
         SoftmaxFunction(),
         SoftmaxFunction()],
        [ContrastiveDivergence(),
         ContrastiveDivergence()],
        0.05,
        0.5,
        inferencing_flag=True,
        inferencing_plan="each")
    dbm.learn(target_arr, traning_count=1, batch_size=100, r_batch_size=-1)

    import pandas as pd
    feature_points_df = pd.DataFrame(dbm.feature_points_arr)
    print(feature_points_df.shape)
    print(feature_points_df.head())
    print("-" * 100)
    print(feature_points_df.tail())

    print("-" * 100)
Code example #3
from pprint import pprint
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

# The following import paths assume the `pydbm` package layout; adjust them
# to match the installed version of the library.
from pydbm.activation.logistic_function import LogisticFunction
from pydbm.approximation.contrastive_divergence import ContrastiveDivergence
from pydbm.dbm.builders.dbm_multi_layer_builder import DBMMultiLayerBuilder
from pydbm.dbm.deepboltzmannmachine.stacked_auto_encoder import StackedAutoEncoder

if __name__ == "__main__":
    '''
    '''

    data_tuple = make_classification(n_samples=20000,
                                     n_features=1000,
                                     n_informative=5,
                                     n_classes=5,
                                     class_sep=1.0,
                                     scale=0.1)
    data_tuple_x, data_tuple_y = data_tuple
    training_x, test_x, training_y, test_y = train_test_split(
        data_tuple_x, data_tuple_y, test_size=0.5, random_state=888)

    dbm = StackedAutoEncoder(DBMMultiLayerBuilder(),
                             [training_x.shape[1], 10, training_x.shape[1]],
                             LogisticFunction(), ContrastiveDivergence(), 0.05)
    dbm.learn(training_x, traning_count=1)
    import pandas as pd
    feature_points_df = pd.DataFrame(dbm.feature_points_arr)
    print(feature_points_df.shape)
    print(feature_points_df.head())
    print("-" * 100)
    print(feature_points_df.tail())
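
    # The train/test split above implies a downstream classification step that is
    # not shown in this excerpt. The sketch below is hypothetical: it fits a
    # scikit-learn LogisticRegression on the extracted feature points and
    # `training_y`; scoring on `test_x` would additionally require inferring
    # feature points for the test set, which is not covered here.
    from sklearn.linear_model import LogisticRegression

    clf = LogisticRegression()
    clf.fit(feature_points_df.values, training_y)
    print("Training accuracy on feature points: {0}".format(
        clf.score(feature_points_df.values, training_y)))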
Code example #4
                    __y = y + _y
                    if __x < 0 or __y < 0:
                        vector = 0
                    else:
                        try:
                            vector = map_arr[__y][__x]
                        except IndexError:
                            vector = 0
                    vector_list.append(vector)
            vector_list_list.append(vector_list)
    vector_arr = np.array(vector_list_list)
    vector_arr = vector_arr.astype(float)
    dbm = StackedAutoEncoder(
        DBMMultiLayerBuilder(),
        [vector_arr.shape[1], vector_arr.shape[1], 10],
        LogisticFunction(),
        ContrastiveDivergence(),
        0.005
    )
    dbm.learn(vector_arr, traning_count=1)
    feature_arr = dbm.feature_points_arr
    feature_arr = feature_arr[:, 0]
    feature_map_arr = feature_arr.reshape(map_d, map_d)

    map_arr = map_arr.astype(object)
    map_arr[:, 0] = wall_label
    map_arr[0, :] = wall_label
    map_arr[:, -1] = wall_label
    map_arr[-1, :] = wall_label
    map_arr[1][1] = start_point_label
    map_arr[map_d - 2][map_d - 2] = end_point_label
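
    # Neither `feature_map_arr` nor the annotated `map_arr` is displayed in this
    # excerpt. The sketch below, which assumes matplotlib is installed, is one
    # way they might be inspected.
    import matplotlib.pyplot as plt

    plt.imshow(feature_map_arr, cmap="viridis")
    plt.colorbar()
    plt.title("Feature points reshaped to the map grid")
    plt.show()

    print(map_arr)  # The map annotated with wall, start, and end labels.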
Code example #5
    def __init__(self,
                 token_list,
                 document_list=[],
                 traning_count=100,
                 batch_size=20,
                 learning_rate=1e-05,
                 feature_dim=100):
        '''
        Initialize.
        
        Args:
            token_list:         The list of all tokens in all sentences.
                                If the input value is a two-dimensional list, 
                                the first-dimensional key represents a sentence number, 
                                and the second-dimensional key represents a token number.

            document_list:      The list of documents, each composed of tokens.
            traning_count:      The number of training epochs.
            batch_size:         Batch size.
            learning_rate:      Learning rate.
            feature_dim:        The dimension of feature points.
        '''
        pair_dict = {}
        document_dict = {}

        self.__token_arr = np.array(token_list)
        if self.__token_arr.ndim == 2:
            for i in range(self.__token_arr.shape[0]):
                for j in range(1, self.__token_arr[i].shape[0] - 1):
                    pair_dict.setdefault(
                        (self.__token_arr[i, j], self.__token_arr[i, j - 1]),
                        0)
                    pair_dict[(self.__token_arr[i, j],
                               self.__token_arr[i, j - 1])] += 1
                    pair_dict.setdefault(
                        (self.__token_arr[i, j], self.__token_arr[i, j + 1]),
                        0)
                    pair_dict[(self.__token_arr[i, j],
                               self.__token_arr[i, j + 1])] += 1
                    document_dict.setdefault(self.__token_arr[i, j], [])
                    for d in range(len(document_list)):
                        if self.__token_arr[i, j] in document_list[d]:
                            document_dict[self.__token_arr[i, j]].append(d)

        elif self.__token_arr.ndim == 1:
            for i in range(1, self.__token_arr.shape[0] - 1):
                pair_dict.setdefault(
                    (self.__token_arr[i], self.__token_arr[i - 1]), 0)
                pair_dict[(self.__token_arr[i], self.__token_arr[i - 1])] += 1
                pair_dict.setdefault(
                    (self.__token_arr[i], self.__token_arr[i + 1]), 0)
                pair_dict[(self.__token_arr[i], self.__token_arr[i + 1])] += 1

                document_dict.setdefault(self.__token_arr[i], [])
                for d in range(len(document_list)):
                    if self.__token_arr[i] in document_list[d]:
                        document_dict[self.__token_arr[i]].append(d)
        else:
            raise ValueError("`token_list` must be a one- or two-dimensional list of tokens.")

        token_list = list(set(self.__token_arr.ravel().tolist()))

        token_arr = np.zeros((len(token_list), len(token_list)))
        pair_arr = np.zeros((len(token_list), len(token_list)))
        document_arr = np.zeros((len(token_list), len(document_list)))
        for i in range(token_arr.shape[0]):
            for j in range(token_arr.shape[0]):
                try:
                    pair_arr[i, j] = pair_dict[(token_list[i], token_list[j])]
                    token_arr[i, j] = 1.0
                except KeyError:
                    # The token pair was not observed; leave the cell as zero.
                    pass

            if len(document_list) > 0:
                if token_list[i] in document_dict:
                    for d in document_dict[token_list[i]]:
                        document_arr[i, d] = 1.0

        pair_arr = np.exp(pair_arr - pair_arr.max())
        pair_arr = pair_arr / pair_arr.sum()
        pair_arr = (pair_arr - pair_arr.mean()) / (pair_arr.std() + 1e-08)
        if len(document_list) > 0:
            document_arr = (document_arr -
                            document_arr.mean()) / (document_arr.std() + 1e-08)

            token_arr = np.c_[pair_arr, document_arr]
            token_arr = (token_arr - token_arr.mean()) / (token_arr.std() +
                                                          1e-08)

        self.__dbm = StackedAutoEncoder(
            DBMMultiLayerBuilder(),
            [token_arr.shape[1], feature_dim, token_arr.shape[1]],
            [TanhFunction(), TanhFunction(),
             TanhFunction()],
            [ContrastiveDivergence(),
             ContrastiveDivergence()],
            learning_rate=learning_rate)
        self.__dbm.learn(token_arr,
                         traning_count=traning_count,
                         batch_size=batch_size,
                         sgd_flag=True)
        feature_points_arr = self.__dbm.feature_points_arr
        self.__token_arr = token_arr
        self.__token_list = token_list
        self.__feature_points_arr = feature_points_arr
Code example #6
class DBMLikeSkipGramVectorizer(VectorizableToken):
    '''
    Vectorize tokens by means of a Deep Boltzmann Machine (DBM).

    Note that this class employs an original method
    based on a library-specific intuition and analogy with skip-gram,
    whereby n-grams are still stored to model language,
    but tokens are allowed to be skipped.
    '''
    def __init__(self,
                 token_list,
                 document_list=[],
                 traning_count=100,
                 batch_size=20,
                 learning_rate=1e-05,
                 feature_dim=100):
        '''
        Initialize.
        
        Args:
            token_list:         The list of all tokens in all sentences.
                                If the input value is a two-dimensional list, 
                                the first-dimensional key represents a sentence number, 
                                and the second-dimensional key represents a token number.

            document_list:      The list of documents, each composed of tokens.
            traning_count:      The number of training epochs.
            batch_size:         Batch size.
            learning_rate:      Learning rate.
            feature_dim:        The dimension of feature points.
        '''
        pair_dict = {}
        document_dict = {}

        self.__token_arr = np.array(token_list)
        if self.__token_arr.ndim == 2:
            for i in range(self.__token_arr.shape[0]):
                for j in range(1, self.__token_arr[i].shape[0] - 1):
                    pair_dict.setdefault(
                        (self.__token_arr[i, j], self.__token_arr[i, j - 1]),
                        0)
                    pair_dict[(self.__token_arr[i, j],
                               self.__token_arr[i, j - 1])] += 1
                    pair_dict.setdefault(
                        (self.__token_arr[i, j], self.__token_arr[i, j + 1]),
                        0)
                    pair_dict[(self.__token_arr[i, j],
                               self.__token_arr[i, j + 1])] += 1
                    document_dict.setdefault(self.__token_arr[i, j], [])
                    for d in range(len(document_list)):
                        if self.__token_arr[i, j] in document_list[d]:
                            document_dict[self.__token_arr[i, j]].append(d)

        elif self.__token_arr.ndim == 1:
            for i in range(1, self.__token_arr.shape[0] - 1):
                pair_dict.setdefault(
                    (self.__token_arr[i], self.__token_arr[i - 1]), 0)
                pair_dict[(self.__token_arr[i], self.__token_arr[i - 1])] += 1
                pair_dict.setdefault(
                    (self.__token_arr[i], self.__token_arr[i + 1]), 0)
                pair_dict[(self.__token_arr[i], self.__token_arr[i + 1])] += 1

                document_dict.setdefault(self.__token_arr[i], [])
                for d in range(len(document_list)):
                    if self.__token_arr[i] in document_list[d]:
                        document_dict[self.__token_arr[i]].append(d)
        else:
            raise ValueError("`token_list` must be a one- or two-dimensional list of tokens.")

        token_list = list(set(self.__token_arr.ravel().tolist()))

        token_arr = np.zeros((len(token_list), len(token_list)))
        pair_arr = np.zeros((len(token_list), len(token_list)))
        document_arr = np.zeros((len(token_list), len(document_list)))
        for i in range(token_arr.shape[0]):
            for j in range(token_arr.shape[0]):
                try:
                    pair_arr[i, j] = pair_dict[(token_list[i], token_list[j])]
                    token_arr[i, j] = 1.0
                except KeyError:
                    # The token pair was not observed; leave the cell as zero.
                    pass

            if len(document_list) > 0:
                if token_list[i] in document_dict:
                    for d in document_dict[token_list[i]]:
                        document_arr[i, d] = 1.0

        pair_arr = np.exp(pair_arr - pair_arr.max())
        pair_arr = pair_arr / pair_arr.sum()
        pair_arr = (pair_arr - pair_arr.mean()) / (pair_arr.std() + 1e-08)
        if len(document_list) > 0:
            document_arr = (document_arr -
                            document_arr.mean()) / (document_arr.std() + 1e-08)

            token_arr = np.c_[pair_arr, document_arr]
            token_arr = (token_arr - token_arr.mean()) / (token_arr.std() +
                                                          1e-08)

        self.__dbm = StackedAutoEncoder(
            DBMMultiLayerBuilder(),
            [token_arr.shape[1], feature_dim, token_arr.shape[1]],
            [TanhFunction(), TanhFunction(),
             TanhFunction()],
            [ContrastiveDivergence(),
             ContrastiveDivergence()],
            learning_rate=learning_rate)
        self.__dbm.learn(token_arr,
                         traning_count=traning_count,
                         batch_size=batch_size,
                         sgd_flag=True)
        feature_points_arr = self.__dbm.feature_points_arr
        self.__token_arr = token_arr
        self.__token_list = token_list
        self.__feature_points_arr = feature_points_arr

    def vectorize(self, token_list):
        '''
        Vectorize a token list.
        
        Args:
            token_list:   The list of tokens.
        
        Returns:
            [vector of token, vector of token, vector of token, ...]
        '''
        return [
            self.__extract_from_feature(token).tolist() for token in token_list
        ]

    def convert_tokens_into_matrix(self, token_list):
        '''
        Create matrix of sentences.

        Args:
            token_list:     The list of tokens.
        
        Returns:
            2-D `np.ndarray` of sentences.
            Each row represents the one-hot vectors of one sentence.
        '''
        return np.array(self.vectorize(token_list)).astype(np.float32)

    def tokenize(self, vector_list):
        '''
        Tokenize vector.

        Args:
            vector_list:    The vector of one token, or a list of such vectors.

        Returns:
            `np.ndarray` containing the token whose feature vector is nearest.
        '''
        vector_arr = np.array(vector_list)
        if vector_arr.ndim == 2 and vector_arr.shape[0] > 1:
            vector_arr = np.nanmean(vector_arr, axis=0)

        vector_arr = vector_arr.reshape(1, -1)
        diff_arr = np.nansum(np.square(vector_arr - self.__feature_points_arr),
                             axis=1)
        return np.array([self.__token_list[diff_arr.argmin(axis=0)]])

    def __extract_from_feature(self, token):
        try:
            key = self.__token_list.index(token)
            arr = self.__feature_points_arr[key]
            arr = arr.astype(np.float32)
        except (ValueError, IndexError):
            # Fall back to the mean feature vector for unknown tokens.
            arr = self.__feature_points_arr.mean(axis=0)
        return arr

    def get_token_arr(self):
        ''' getter '''
        return self.__token_arr

    def set_token_arr(self, value):
        ''' setter '''
        raise TypeError("This property must be read-only.")

    token_arr = property(get_token_arr, set_token_arr)

    def set_readonly(self, value):
        ''' setter '''
        raise TypeError("This property must be read-only.")

    def get_token_list(self):
        ''' getter '''
        return self.__token_list

    token_list = property(get_token_list, set_readonly)
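

# The class above is defined but not exercised in this excerpt. The usage sketch
# below is hypothetical: the token lists are made-up toy data and the
# hyperparameter values are only illustrative; it calls nothing beyond the
# public methods shown above.
if __name__ == "__main__":
    # Toy corpus: two tokenized sentences of equal length, so that
    # `np.array(token_list)` becomes a proper 2-D array.
    token_list = [
        ["deep", "boltzmann", "machine", "learns", "token", "features"],
        ["skip", "gram", "allows", "tokens", "to", "skip"]
    ]
    vectorizer = DBMLikeSkipGramVectorizer(
        token_list,
        traning_count=10,
        batch_size=5,
        feature_dim=10
    )
    # Vectorize tokens and map one vector back to its nearest token.
    vector_list = vectorizer.vectorize(["deep", "machine"])
    print(vectorizer.convert_tokens_into_matrix(["deep", "machine"]).shape)
    print(vectorizer.tokenize(vector_list[0]))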