def fit(self, X, y=None):
    # Compute the codebook
    self.codebook = Codebook(K=self.K,
                             no_dump=self.no_dump_codebook,
                             force_reload=self.force_reload)
    self.codebook.fit(X)
    return self
class BoVWextractor(BaseEstimator):
    def __init__(self, K=512, no_dump_codebook=False, force_reload=False):
        self.K = K
        self.no_dump_codebook=no_dump_codebook
        self.force_reload = force_reload

    def fit(self, X, y=None):
        # Compute the codebook
        self.codebook = Codebook(K=self.K,no_dump=self.no_dump_codebook, force_reload=self.force_reload)
        self.codebook.fit(X['descriptors'])
        return self

    def transform(self, X):
        print 'Getting BoVW representation'
        init = time.time()

        descriptors = X['descriptors']

        visual_words = np.zeros((len(descriptors), self.K), dtype=np.float32)
        for i in xrange(len(descriptors)):
            words = self.codebook.predict(descriptors[i])
            visual_words[i, :] = np.bincount(words, minlength=self.K)

        end = time.time()
        print '\tDone in ' + str(end - init) + ' secs.'
        return visual_words
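# Minimal usage sketch for BoVWextractor (hedged: it assumes a Codebook implementation like
# the one used in fit() above, and dicts X_train / X_test carrying a 'descriptors' entry with
# one local-descriptor array per image; the variable names are illustrative only):
#
#   bovw = BoVWextractor(K=512)
#   bovw.fit(X_train)                        # clusters descriptors into K visual words
#   hists_train = bovw.transform(X_train)    # (n_images, K) visual-word histograms
#   hists_test = bovw.transform(X_test)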
class SpatialPyramids(BaseEstimator):
    def __init__(self, K=512, num_levels=3, no_dump_codebook=False):
        self.K = K
        self.num_levels = num_levels
        self.no_dump_codebook = no_dump_codebook

    def fit(self, X, y=None):
        # Compute the codebook
        self.codebook = Codebook(K=self.K, no_dump=self.no_dump_codebook)
        self.codebook.fit(X['descriptors'])
        return self

    def transform(self, X):
        print 'Getting Spatial Pyramid representation'
        init = time.time()

        descriptors = X['descriptors']
        positions = X['positions']
        imsizes = X['imsizes']

        # Num. of cols/rows for each pyramid level
        grid_ncolsrows = 2**np.arange(self.num_levels)

        visual_words = np.zeros(
            (len(descriptors), self.K * np.sum(grid_ncolsrows**2)),
            dtype=np.float32)
        for im in xrange(len(descriptors)):
            # Compute the words
            words = self.codebook.predict(descriptors[im])

            # Compute the bincount for each grid cell in each pyramid level
            current_vw = []
            for l in range(self.num_levels):
                r_vec = np.linspace(0,
                                    imsizes[im][0] + 1,
                                    num=grid_ncolsrows[l] + 1)
                c_vec = np.linspace(0,
                                    imsizes[im][1] + 1,
                                    num=grid_ncolsrows[l] + 1)
                for i in range(grid_ncolsrows[l]):
                    for j in range(grid_ncolsrows[l]):
                        rb = np.logical_and(positions[im][:, 0] >= r_vec[i],
                                            positions[im][:, 0] < r_vec[i + 1])
                        cb = np.logical_and(positions[im][:, 1] >= c_vec[j],
                                            positions[im][:, 1] < c_vec[j + 1])
                        current_vw.extend(
                            np.bincount(words[np.logical_and(rb, cb)],
                                        minlength=self.K))

            # Save the computed values
            visual_words[im, :] = current_vw

        end = time.time()
        print '\tDone in ' + str(end - init) + ' secs.'
        return visual_words
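# Quick sanity check of the Spatial Pyramid descriptor length used above: level l splits the
# image into a 2**l x 2**l grid and contributes one K-bin histogram per cell, so the total
# length is K * sum_l (2**l)**2. Standalone numpy check (defaults K=512, num_levels=3):
import numpy as np

K, num_levels = 512, 3
grid_ncolsrows = 2 ** np.arange(num_levels)       # [1, 2, 4]
print(K * np.sum(grid_ncolsrows ** 2))            # 512 * (1 + 4 + 16) = 10752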
Example #4
    def train(self,
              instances,
              dev_set=None,
              max_epoch=30,
              learning_rate=.5,
              batch_size=30):
        # Construct a statistical model from labeled instances.

        self.codebook = Codebook()
        self.codebook.supervised_populate(instances)

        self.parameters = np.zeros((self.codebook.dimension()))
        self._train_sgd(instances, dev_set, max_epoch, learning_rate,
                        batch_size)
    def __init__(self,
                 data,
                 neighborhood,
                 normalizer=None,
                 mapsize=None,
                 mask=None,
                 mapshape='planar',
                 lattice='rect',
                 initialization='pca',
                 training='batch',
                 name='sompy',
                 component_names=None):
        """
        Self Organizing Map

        :param data: data to be clustered, represented as a matrix of n rows,
            as inputs and m cols as input features
        :param neighborhood: neighborhood object calculator.
        :param normalizer: normalizer object calculator.
        :param mapsize: tuple/list defining the dimensions of the som. If
            single number is provided is considered as the number of nodes.
        :param mask: mask
        :param mapshape: shape of the som.
        :param lattice: type of lattice.
        :param initialization: method to be used for initialization of the som.
        :param name: name used to identify the som
        :param training: Training mode (seq, batch)
        """
        self._data = normalizer.normalize(data) if normalizer else data
        self._normalizer = normalizer
        self._dim = data.shape[1]
        self._dlen = data.shape[0]
        self._dlabel = None
        self._bmu = None

        self.name = name
        self.data_raw = data
        self.neighborhood = neighborhood
        self.mapshape = mapshape
        self.initialization = initialization
        self.mask = mask or np.ones([1, self._dim])
        mapsize = self.calculate_map_size(lattice) if not mapsize else mapsize
        self.codebook = Codebook(mapsize, lattice)
        self.training = training
        self._component_names = self.build_component_names(
        ) if component_names is None else [component_names]
        self._distance_matrix = self.calculate_map_dist()
Example #6
    def __init__(self, shape):
        self.codebooks = []
        self.height = shape[1]
        self.width = shape[0]

        for i in range(self.width):
            for j in range(self.height):
                self.codebooks.append(Codebook())
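# Note: the codebooks list above is flattened in (width, height) order, so the Codebook
# for pixel column x and row y sits at index x * self.height + y. A hypothetical helper
# (not part of the original snippet) to look one up could be:
#
#   def codebook_at(self, x, y):
#       return self.codebooks[x * self.height + y]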
Example #7
    def __init__(self, Y, D, sigma, b=1e-8, scale=1):

        self.input_shape = Y.shape
        self.sigma = sigma
        self.scale = scale

        m, r, c = D.shape
        _, _, self.h, self.w = Y.shape
        self.hx = int(np.ceil(self.h * scale)) if self.h > 1 else self.h
        self.wx = int(np.ceil(self.w * scale)) if self.w > 1 else self.w

        bh = int(2 * np.ceil(3 * sigma[0] * scale) + 1) if self.h > 1 else 1
        bw = int(2 * np.ceil(3 * sigma[1] * scale) + 1) if self.w > 1 else 1
        self.psf_support_scaled = (bh, bw)

        bh = int(2 * np.ceil(3 * sigma[0]) + 1) if self.h > 1 else 1
        bw = int(2 * np.ceil(3 * sigma[1]) + 1) if self.w > 1 else 1
        self.psf_support = (bh, bw)

        # Set up X
        x_shape = (m, self.hx, self.wx)
        self.X = torch.ones(x_shape).float()

        # Set up Y
        self.Y = torch.tensor(Y).float().flatten(start_dim=0, end_dim=1)

        # Set up b
        self.b = torch.tensor(b).float()
        if self.b.ndim == 4:
            self.b = self.b.flatten(start_dim=0, end_dim=1)

        # Set up codebook
        self.codebook = Codebook(D)

        # Set up spatial blur
        self.psf = PSF((self.h, self.w), sigma, scale)

        # Prepare constant
        ones_channels = torch.ones((r * c, 1, 1))
        ones_space = torch.ones((1, self.h, self.w))
        self.denominator =  self.codebook.matmul_t(ones_channels) * \
                            self.psf.matmul_t(ones_space)

        # Compute Yhat = DXG
        self.Yhat = self.__forward(self.X)
Example #8
    def __init__(self,
                 data,
                 neighborhood,
                 normalizer=None,
                 mapsize=None,
                 mask=None,
                 mapshape='planar',
                 lattice='rect',
                 initialization='pca',
                 training='batch',
                 name='sompy'):
        """
        Self Organizing Map

        :param data: data to be clustered, represented as a matrix of n rows, as inputs and m cols as input features
        :param neighborhood: neighborhood object calculator.
        :param normalizer: normalizer object calculator.
        :param mapsize: tuple/list defining the dimensions of the som. If single number is provided is considered as the number of nodes.
        :param mask: mask
        :param mapshape: shape of the som.
        :param lattice: type of lattice.
        :param initialization: method to be used for initialization of the som.
        :param name: name used to identify the som
        :param training: Training mode (seq, batch)
        """
        self._data = normalizer.normalize(data) if normalizer else data
        self._normalizer = normalizer
        self._dim = data.shape[1]
        self._dlen = data.shape[0]
        self._dlabel = None
        self._bmu = None

        self.name = name
        self.data_raw = data
        self.neighborhood = neighborhood
        self.mapshape = mapshape
        self.initialization = initialization
        self.mask = mask or np.ones([1, self._dim])
        self.codebook = Codebook(mapsize, lattice)
        self.training = training

        self._component_names = self.build_component_names()
        self._distance_matrix = self.calculate_map_dist()
Example #9
                 "../images/Crowd_PETS09/S0/Background/View_001/Time_13-06/"))
EPSILON_1 = 1
TRAIN_N_IMG = 10

if __name__ == '__main__':

    img_list = os.listdir(TRAINING_PATH)[:TRAIN_N_IMG]

    for it, file in enumerate(img_list):

        img = cv2.cvtColor(cv2.imread(os.path.join(TRAINING_PATH, file)),
                           cv2.COLOR_BGR2RGB).astype(float)

        if it == 0:
            codebooks = np.array(
                [Codebook() for pixel in range(img.shape[0] * img.shape[1])])

        for px_row in img:
            for px_idx, px in enumerate(px_row):

                R = px[0]
                G = px[1]
                B = px[2]
                X = np.array([R, G, B])
                I = sqrt(R**2 + G**2 + B**2)
                pixel = Pixel(X, I)

                codebook_empty = True
                no_match = True

                for codeword in codebooks[px_idx].codewords:
Example #10
    def codebook(self):
        """ Export a code book of categories and codes.
        """

        Codebook(self.settings, self.ui.textEdit)
def fit(self, X, y=None):
    # Compute the codebook
    self.codebook = Codebook(K=self.K, no_dump=self.no_dump_codebook)
    self.codebook.fit(X['descriptors'])
    return self
Example #12
File: sompy.py  Project: dongzhiming/SMEA
class SOM(object):
    def __init__(self,
                 data,
                 neighborhood,
                 normalizer=None,
                 mapsize=None,
                 mask=None,
                 mapshape='planar',
                 lattice='rect',
                 initialization='pca',
                 training='batch',
                 name='sompy'):
        """
        Self Organizing Map

        :param data: data to be clustered, represented as a matrix of n rows,
            as inputs and m cols as input features
        :param neighborhood: neighborhood object calculator.
        :param normalizer: normalizer object calculator.
        :param mapsize: tuple/list defining the dimensions of the som. If
            single number is provided is considered as the number of nodes.
        :param mask: mask
        :param mapshape: shape of the som.
        :param lattice: type of lattice.
        :param initialization: method to be used for initialization of the som.
        :param name: name used to identify the som
        :param training: Training mode (seq, batch)
        """
        print "DDDD", data
        self._data = data  # normalizer.normalize(data) if normalizer else data
        print "data  : \n", self._data
        print "map size  : ", mapsize

        self._normalizer = normalizer
        print "_normalizer is a object to class normalizer : ", self._normalizer
        self._dim = data.shape[1]
        print "data shape[1]  :  ", self._dim
        self._dlen = data.shape[0]
        print "data shape[0] : ", data.shape[0]
        self._dlabel = None
        print "data label  : ", self._dlabel
        self._bmu = None
        print "intial bmu  :  ", self._bmu
        self.name = name
        print "name   :  ", self.name
        self.data_raw = data
        print "data_raw  : ", self.data_raw
        #print "data_raw  : ", self.data_raw
        self.neighborhood = neighborhood
        self.mapshape = mapshape
        print "mapshape : ", self.mapshape
        self.initialization = initialization
        print "initilization : ", self.initialization
        self.mask = mask or np.ones([1, self._dim])
        print "mask  : ", self.mask
        self.codebook = Codebook(mapsize, lattice)
        print "--------codebook is a object to class Codebook---------\n", self.codebook
        self.training = training
        print "training mode  : ", self.training

        self._component_names = self.build_component_names()
        print "Initial component name  : ", self._component_names
        self._distance_matrix = self.calculate_map_dist()
        print "initial distance matrix  : ", self._distance_matrix
        #self._data_labels=self.build_data_labels()
        #print "data labels : ", self._data_labels

    @property
    def component_names(self):
        return self._component_names

    @component_names.setter
    def component_names(self, compnames):
        if self._dim == len(compnames):
            self._component_names = np.asarray(compnames)[np.newaxis, :]
        else:
            raise ComponentNamesError('Component names should have the same '
                                      'size as the data dimension/features')

    def build_component_names(self):
        cc = ['Variable-' + str(i + 1) for i in range(0, self._dim)]
        return np.asarray(cc)[np.newaxis, :]

    @property
    def data_labels(self):
        return self._dlabel

    @data_labels.setter
    def data_labels(self, labels):
        """
        Set labels of the training data, it should be in the format of a list
        of strings
        """
        if labels.shape == (1, self._dlen):
            label = labels.T
        elif labels.shape == (self._dlen, 1):
            label = labels
        elif labels.shape == (self._dlen, ):
            label = labels[:, np.newaxis]
        else:
            raise LabelsError('wrong label format')

        self._dlabel = label

    def build_data_labels(self):
        cc = ['dlabel-' + str(i) for i in range(0, self._dlen)]
        return np.asarray(cc)[:, np.newaxis]

    def calculate_map_dist(self):
        """
        Calculates the grid distance, which will be used during the training
        steps. It supports only planar grids for the moment
        """
        nnodes = self.codebook.nnodes

        distance_matrix = np.zeros((nnodes, nnodes))
        for i in range(nnodes):
            distance_matrix[i] = self.codebook.grid_dist(i).reshape(1, nnodes)

        return distance_matrix

    @timeit()
    def train(self, n_job=1, shared_memory=False, verbose='info'):
        """
        Trains the som

        :param n_job: number of jobs to use to parallelize the training
        :param shared_memory: flag to active shared memory
        :param verbose: verbosity, could be 'debug', 'info' or None
        """
        logging.root.setLevel(
            getattr(logging, verbose.upper()) if verbose else logging.ERROR)

        logging.info(" Training...")
        logging.debug((
            "--------------------------------------------------------------\n"
            " details: \n"
            "      > data len is {data_len} and data dimension is {data_dim}\n"
            "      > map size is {mpsz0},{mpsz1}\n"
            "      > array size in log10 scale is {array_size}\n"
            "      > number of jobs in parallel: {n_job}\n"
            " -------------------------------------------------------------\n"
        ).format(data_len=self._dlen,
                 data_dim=self._dim,
                 mpsz0=self.codebook.mapsize[0],
                 mpsz1=self.codebook.mapsize[1],
                 array_size=np.log10(self._dlen * self.codebook.nnodes *
                                     self._dim),
                 n_job=n_job))

        if self.initialization == 'random':
            self.codebook.random_initialization(self._data)

        elif self.initialization == 'pca':
            self.codebook.pca_linear_initialization(self._data)

        self.rough_train(njob=n_job, shared_memory=shared_memory)
        self.finetune_train(njob=n_job, shared_memory=shared_memory)

        logging.debug(
            " --------------------------------------------------------------")
        logging.info(" Final quantization error: %f" % np.mean(self._bmu[1]))

    def _calculate_ms_and_mpd(self):
        print "--------------------------------------------"
        print "_calculation of ms and mpd  .........\n"
        mn = np.min(self.codebook.mapsize)
        max_s = max(self.codebook.mapsize[0], self.codebook.mapsize[1])

        if mn == 1:
            mpd = float(self.codebook.nnodes * 10) / float(self._dlen)
        else:
            mpd = float(self.codebook.nnodes) / float(self._dlen)
        ms = max_s / 2.0 if mn == 1 else max_s
        print "ms , mpd  : ", ms, mpd

        return ms, mpd

    def rough_train(self, njob=1, shared_memory=False):
        print "--------------------------------------------"
        logging.info(" Rough training...")
        #print "rough training start here ---------------\n"

        ms, mpd = self._calculate_ms_and_mpd()

        trainlen, radiusin, radiusfin = int(np.ceil(30 * mpd)), None, None

        if self.initialization == 'random':
            radiusin = max(1, np.ceil(ms / 3.))
            radiusfin = max(1, radiusin / 6.)
            print "trainlen, radiusin , radiusfin if intialization is random  ", trainlen, radiusin, radiusfin

        elif self.initialization == 'pca':
            radiusin = max(1, np.ceil(ms / 8.))
            radiusfin = max(1, radiusin / 4.)
            print "trainlen, radiusin , radiusfin if intialization is pca", trainlen, radiusin, radiusfin

        self._batchtrain(trainlen, radiusin, radiusfin, njob, shared_memory)
        print "rough training END here ---------------\n"
        print "++++++++++++++++++++++++++++++++++++++++++++"

    def finetune_train(self, njob=1, shared_memory=False):
        print "#################################################--START"
        logging.info(" Finetune training...")

        ms, mpd = self._calculate_ms_and_mpd()

        trainlen, radiusin, radiusfin = None, None, None

        if self.initialization == 'random':
            trainlen = int(np.ceil(50 * mpd))
            radiusin = max(1, ms / 12.)  # from radius fin in rough training
            radiusfin = max(1, radiusin / 25.)
            print "trainlen, radiusin , radiusfin if intialization is random  ", trainlen, radiusin, radiusfin

        elif self.initialization == 'pca':
            trainlen = int(np.ceil(40 * mpd))
            radiusin = max(1, np.ceil(ms / 8.) / 4)
            radiusfin = 1  # max(1, ms/128)
            print "trainlen, radiusin , radiusfin if intialization is pca", trainlen, radiusin, radiusfin

        self._batchtrain(trainlen, radiusin, radiusfin, njob, shared_memory)
        print "#################################################--END"

    def _batchtrain(self,
                    trainlen,
                    radiusin,
                    radiusfin,
                    njob=1,
                    shared_memory=False):
        print "--------------------------------------------"
        radius = np.linspace(radiusin, radiusfin, trainlen)
        print "radius  : ", radius

        if shared_memory:
            data = self._data
            data_folder = tempfile.mkdtemp()
            data_name = os.path.join(data_folder, 'data')
            dump(data, data_name)
            data = load(data_name, mmap_mode='r')

        else:
            data = self._data
            print "data  :  ", data

        bmu = None

        # X2 is part of the Euclidean distance (x-y)^2 = x^2 + y^2 - 2xy that we use
        # for each data row when finding the BMU.
        # Since it is a fixed value we can skip it during BMU finding for each
        # data point, but later we need it to calculate the quantization error.
        fixed_euclidean_x2 = np.einsum('ij,ij->i', data, data)
        #print "fixed_euclidean_x2 : ", fixed_euclidean_x2
        #print "len of fixed_euclidean_x2 : ", len(fixed_euclidean_x2)
        #print data

        logging.info(" radius_ini: %f , radius_final: %f, trainlen: %d\n" %
                     (radiusin, radiusfin, trainlen))

        print "traing starts here up to trainlen................."
        for i in range(trainlen):
            t1 = time()
            neighborhood = self.neighborhood.calculate(self._distance_matrix,
                                                       radius[i],
                                                       self.codebook.nnodes)
            #print "neighbour \n: ", neighborhood
            #print "CODEBOOK MATRIX  \n: ", self.codebook.matrix

            bmu = self.find_bmu(data, njb=njob)
            #print "bmu  \n: ", bmu
            print "intial codebook matrix   :  \n", self.codebook.matrix
            self.codebook.matrix = self.update_codebook_voronoi(
                data, bmu, neighborhood)
            #print "codebook matrix after updation  :  \n", self.codebook.matrix
            print "updated  codebook matrix   :  \n", self.codebook.matrix

            qerror = (i + 1, round(time() - t1, 3),
                      np.mean(np.sqrt(bmu[1] + fixed_euclidean_x2)))
            logging.info(
                " epoch: %d ---> elapsed time:  %f, quantization error: %f\n" %
                qerror)
            break  # note: stops after the first epoch, so only one batch update is executed here

        bmu[1] = np.sqrt(bmu[1] + fixed_euclidean_x2)
        self._bmu = bmu
        print "bmu after training  :  ", self._bmu
        print "++++++++++++++++++++++++++++++++++++++++++++"

    @timeit(logging.DEBUG)
    def find_bmu(self, input_matrix, njb=1):
        """
        Finds the best matching unit (BMU) for each input row of the input
        matrix. It does them all at once, parallelizing the calculation instead
        of going through each input and running it against the codebook.

        :param input_matrix: numpy matrix representing inputs as rows and
            features/dimension as cols
        :param njb: number of jobs to parallelize the search
        :returns: the best matching unit for each input
        """
        print "--------------------------------------------"
        dlen = input_matrix.shape[0]
        #print "codebook matrix  : \n ", self.codebook.matrix
        y2 = np.einsum('ij,ij->i', self.codebook.matrix, self.codebook.matrix)
        #print "np.einsum('ij,ij->i', self.codebook.matrix, self.codebook.matrix)  :\n", np.einsum('ij,ij->i', self.codebook.matrix, self.codebook.matrix)
        parallelizer = Parallel(n_jobs=njb, pre_dispatch='3*n_jobs')
        chunk_bmu_finder = delayed(_chunk_based_bmu_find)

        def row_chunk(part):
            return part * dlen // njb

        def col_chunk(part):
            return min((part + 1) * dlen // njb, dlen)

        b = parallelizer(
            chunk_bmu_finder(input_matrix[row_chunk(i):col_chunk(i)],
                             self.codebook.matrix, y2) for i in range(njb))
        bmu = np.asarray(list(itertools.chain(*b))).T

        del b
        #print "bmu  \n: ", bmu
        return bmu

    @timeit(logging.DEBUG)
    def update_codebook_voronoi(self, training_data, bmu, neighborhood):
        """
        Updates the weights of each node in the codebook that belongs to the
        bmu's neighborhood.

        First finds the Voronoi set of each node, which only requires computing a
        smaller matrix. This is much faster than the classic batch training
        algorithm; it is based on the algorithm implemented in the SOM Toolbox
        for Matlab by Helsinki University.

        :param training_data: input matrix with input vectors as rows and
            vector features as cols
        :param bmu: best matching unit for each input data. Has shape of
            (2, dlen) where first row has bmu indexes
        :param neighborhood: matrix representing the neighborhood of each bmu

        :returns: An updated codebook that incorporates the learnings from the
            input data
        """
        print "--------------------------------------------"
        print "strating of update_codebook_voronoi function ...."
        row = bmu[0].astype(int)
        print "rows  (contain bmu index) :  ", row
        col = np.arange(self._dlen)
        print "coulms (contain data sample no.)  :  ", col
        val = np.tile(1, self._dlen)
        P = csr_matrix((val, (row, col)),
                       shape=(self.codebook.nnodes, self._dlen))
        print "csr matrix in which csr_matrix[k, k]=val[k] : ", P
        S = P.dot(training_data)  # each neuron gets the sum of the data samples matched to it
        print "sparse matrix after multiplying with training data : \n", S

        # neighborhood has nnodes*nnodes and S has nnodes*dim
        # ---> numerator has nnodes*dim
        nom = neighborhood.T.dot(S)
        print "numerator  : ", nom
        print "P.sum(axis=1) and its shape : ", P.sum(axis=1), P.sum(
            axis=1).shape
        nV = P.sum(axis=1).reshape(
            1, self.codebook.nnodes
        )  # indicates how many data samples match a particular neuron
        print "nV :  ", nV
        print "neighborhood.T  :  \n", neighborhood.T
        print "nV.dot(neighborhood.T) and its shape  \n : ", nV.dot(
            neighborhood.T), nV.dot(neighborhood.T).shape
        denom = nV.dot(neighborhood.T).reshape(self.codebook.nnodes, 1)
        print "denominator  : \n", denom
        new_codebook = np.divide(nom, denom)
        print "new codebook :  \n ", new_codebook
        print "new codebokk round up to decimal 6 : ", np.around(new_codebook,
                                                                 decimals=6)
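        # In other words, with H the neighborhood matrix and b(i) the BMU of sample x_i,
        # every node k is moved to the neighborhood-weighted mean of the data:
        #     w_k = sum_i H[b(i), k] * x_i  /  sum_i H[b(i), k]
        # which is exactly nom / denom above (P accumulates the per-node Voronoi sums,
        # nV the per-node sample counts).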
        return np.around(new_codebook, decimals=6)

    def project_data(self, data):
        """
        Projects a data set onto a trained SOM. It is based on the nearest-neighbor
        search module of scikit-learn, but it is not that fast.
        """
        clf = neighbors.KNeighborsClassifier(n_neighbors=1)
        #print "clf  :  ", clf
        #print "codebook.matrix.shape[0]  :",self.codebook.matrix.shape[0]
        labels = np.arange(0, self.codebook.matrix.shape[0])
        #print "labels : ",labels
        clf.fit(self.codebook.matrix, labels)

        # The codebook values are all normalized
        # we can normalize the input data based on mean and std of
        # original data
        data = self._normalizer.normalize_by(self.data_raw, data)
        print "------"
        print "normal data : ", clf.predict(data)
        print "++++++"
        return clf.predict(data)

    def predict_by(self, data, target, k=5, wt='distance'):
        # here it is assumed that target is the last column in the codebook
        # and data has dim-1 columns
        dim = self.codebook.matrix.shape[1]
        ind = np.arange(0, dim)
        indX = ind[ind != target]
        x = self.codebook.matrix[:, indX]
        y = self.codebook.matrix[:, target]
        n_neighbors = k
        clf = neighbors.KNeighborsRegressor(n_neighbors, weights=wt)
        clf.fit(x, y)

        # The codebook values are all normalized
        # we can normalize the input data based on mean and std of
        # original data
        dimdata = data.shape[1]

        if dimdata == dim:
            data[:, target] = 0
            data = self._normalizer.normalize_by(self.data_raw, data)
            data = data[:, indX]

        elif dimdata == dim - 1:
            data = self._normalizer.normalize_by(self.data_raw[:, indX], data)

        predicted_values = clf.predict(data)
        predicted_values = self._normalizer.denormalize_by(
            self.data_raw[:, target], predicted_values)
        return predicted_values

    def predict(self, x_test, k=5, wt='distance'):
        """
        Similar to scikit-learn, we assume that we have X_tr, Y_tr and X_test. Here
        it is assumed that target is the last column in the codebook and data
        has dim-1 columns

        :param x_test: input vector
        :param k: number of neighbors to use
        :param wt: method to use for the weights
            (more detail in KNeighborsRegressor docs)
        :returns: predicted values for the input data
        """
        target = self.data_raw.shape[1] - 1
        x_train = self.codebook.matrix[:, :target]
        y_train = self.codebook.matrix[:, target]
        clf = neighbors.KNeighborsRegressor(k, weights=wt)
        clf.fit(x_train, y_train)

        # The codebook values are all normalized
        # we can normalize the input data based on mean and std of
        # original data
        x_test = self._normalizer.normalize_by(self.data_raw[:, :target],
                                               x_test)
        predicted_values = clf.predict(x_test)

        return self._normalizer.denormalize_by(self.data_raw[:, target],
                                               predicted_values)

    def find_k_nodes(self, data, k=5):
        from sklearn.neighbors import NearestNeighbors
        # we find the k most similar nodes to the input vector
        neighbor = NearestNeighbors(n_neighbors=k, metric='euclidean')
        neighbor.fit(self.codebook.matrix)

        # The codebook values are all normalized
        # we can normalize the input data based on mean and std of
        # original data
        return neighbor.kneighbors(
            self._normalizer.normalize_by(self.data_raw, data))

    def bmu_ind_to_xy(self, bmu_ind):
        """
        Translates a best matching unit index to the corresponding
        matrix x,y coordinates.

        :param bmu_ind: node index of the best matching unit
            (number of node from top left node)
        :returns: corresponding (x,y) coordinate
        """
        rows = self.codebook.mapsize[0]
        cols = self.codebook.mapsize[1]

        # bmu should be an integer between 0 to no_nodes
        out = np.zeros((bmu_ind.shape[0], 3))
        out[:, 2] = bmu_ind
        out[:, 0] = rows - 1 - bmu_ind / cols
        out[:, 0] = bmu_ind / cols
        out[:, 1] = bmu_ind % cols
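        # Example: on a 3 x 4 map (rows=3, cols=4), bmu_ind=7 gives row 7 // 4 = 1 and
        # column 7 % 4 = 3, i.e. out = [1, 3, 7] for that unit. Note the first out[:, 0]
        # assignment above is immediately overwritten by the second one.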

        return out.astype(int)

    def cluster(self, n_clusters=3):
        import sklearn.cluster as clust
        cl_labels = clust.KMeans(n_clusters=n_clusters).fit_predict(
            self._normalizer.denormalize_by(self.data_raw,
                                            self.codebook.matrix))
        self.cluster_labels = cl_labels
        return cl_labels

    def predict_probability(self, data, target, k=5):
        """
        Predicts probability of the input data to be target

        :param data: data to predict; it is assumed that 'target' is the last
            column in the codebook, so data should have dim-1 columns
        :param target: target to predict probability
        :param k: k parameter on KNeighborsRegressor
        :returns: probability of the data being the target
        """
        dim = self.codebook.matrix.shape[1]
        ind = np.arange(0, dim)
        indx = ind[ind != target]
        x = self.codebook.matrix[:, indx]
        y = self.codebook.matrix[:, target]

        clf = neighbors.KNeighborsRegressor(k, weights='distance')
        clf.fit(x, y)

        # The codebook values are all normalized
        # we can normalize the input data based on mean and std of
        # original data
        dimdata = data.shape[1]

        if dimdata == dim:
            data[:, target] = 0
            data = self._normalizer.normalize_by(self.data_raw, data)
            data = data[:, indx]

        elif dimdata == dim - 1:
            data = self._normalizer.normalize_by(self.data_raw[:, indx], data)

        weights, ind = clf.kneighbors(data,
                                      n_neighbors=k,
                                      return_distance=True)
        weights = 1. / weights
        sum_ = np.sum(weights, axis=1)
        weights = weights / sum_[:, np.newaxis]
        labels = np.sign(self.codebook.matrix[ind, target])
        labels[labels >= 0] = 1

        # for positives
        pos_prob = labels.copy()
        pos_prob[pos_prob < 0] = 0
        pos_prob *= weights
        pos_prob = np.sum(pos_prob, axis=1)[:, np.newaxis]

        # for negatives
        neg_prob = labels.copy()
        neg_prob[neg_prob > 0] = 0
        neg_prob = neg_prob * weights * -1
        neg_prob = np.sum(neg_prob, axis=1)[:, np.newaxis]

        return np.concatenate((pos_prob, neg_prob), axis=1)

    def node_activation(self, data, target=None, wt='distance'):
        weights, ind = None, None

        if not target:
            clf = neighbors.KNeighborsClassifier(
                n_neighbors=self.codebook.nnodes)
            labels = np.arange(0, self.codebook.matrix.shape[0])
            clf.fit(self.codebook.matrix, labels)

            # The codebook values are all normalized
            # we can normalize the input data based on mean and std of
            # original data
            data = self._normalizer.normalize_by(self.data_raw, data)
            weights, ind = clf.kneighbors(data)
            print "--------------\n", weights, ind

            # Inverse-distance weighting
            weights = 1. / weights

        return weights, ind
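# Hedged usage sketch for the SOM class above (the neighborhood and normalizer arguments are
# sompy-style helper objects that are not defined in this listing; names are illustrative):
#
#   som = SOM(data, neighborhood, normalizer=normalizer, mapsize=[20, 20],
#             initialization='pca', training='batch')
#   som.train(n_job=1, verbose='info')
#   node_per_sample = som.project_data(data)       # winning node index per input row
#   node_xy = som.bmu_ind_to_xy(node_per_sample)   # (row, col, index) per input row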
Example #13
class MaxEnt:

    def __init__(self):
        self.parameters = np.zeros((0, 0))

    def train(self,
              instances,
              dev_set=None,
              max_epoch=30,
              learning_rate=.5,
              batch_size=30):
        # Construct a statistical model from labeled instances.

        self.codebook = Codebook()
        self.codebook.supervised_populate(instances)

        self.parameters = np.zeros((self.codebook.dimension()))
        self._train_sgd(instances, dev_set, max_epoch, learning_rate,
                        batch_size)

    def _mini_batch(self, instances, batch_size):
        # Yield mini-batches from the original data

        shuffle(instances)
        for i in range(0, len(instances), batch_size):
            yield instances[i:i + batch_size]

    def _compute_gradient(self, batch):
        # Compute the gradient given the current batch of data

        log_likelihood = 0
        observed_count = np.zeros(self.codebook.dimension())
        expected_count = np.zeros(self.codebook.dimension())

        for datapoint in batch:
            feature_map = [
                self.codebook.feature_index(feature)
                for feature in datapoint.features()
            ]

            observed_count[feature_map,
                           self.codebook.label_index(datapoint.label)] += 1
            lambda_vector = self.parameters[feature_map, :].sum(0)
            log_likelihood -= sum(lambda_vector) - logsumexp(lambda_vector)
            posterior = np.exp(
                lambda_vector[self.codebook.label_index(datapoint.label)] -
                logsumexp(lambda_vector))
            expected_count[
                feature_map,
                self.codebook.label_index(datapoint.label)] += posterior

        return observed_count - expected_count, log_likelihood

    def _train_sgd(self, train_instances, dev_set, max_epoch, learning_rate,
                   batch_size):
        # Train MaxEnt model with Mini-batch Gradient Descent

        for epoch in range(1, max_epoch + 1):
            for batch in self._mini_batch(train_instances, batch_size):
                gradient, log_likelihood = self._compute_gradient(batch)
                self.parameters += gradient * learning_rate
            if dev_set:
                print("(Epoch, accuracy):", (epoch, self.accuracy(dev_set)))

    def accuracy(self, instances):
        # Simple accuracy test for the dev set

        current_state = [self.classify(x) == x.label for x in instances]
        return float(sum(current_state)) / len(current_state)

    def classify(self, instance):
        feature_map = [
            self.codebook.feature_index(feature)
            for feature in instance.features()
            if feature in self.codebook._features2index
        ]

        lambda_vector = self.parameters[feature_map, :].sum(0)
        posteriors = np.exp(lambda_vector - logsumexp(lambda_vector))
        return self.codebook.get_label(np.argmax(posteriors))
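# Minimal usage sketch for the MaxEnt class above (hedged: it assumes instance objects that
# expose .features() and .label, plus a Codebook with supervised_populate(), exactly as used
# in train() and classify(); the variable names are illustrative only):
#
#   model = MaxEnt()
#   model.train(train_instances, dev_set=dev_instances, max_epoch=30,
#               learning_rate=0.5, batch_size=30)
#   print(model.accuracy(dev_instances))
#   predicted_label = model.classify(dev_instances[0])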
Example #14
class ISTDeco:
    '''
    ISTDECO - Deconvolves 1D or 2D spatial transcriptomic data

    Parameters
    ----------
    Y : float
        Input image data of shape (rounds, channels, height, width)
    D : float
        Codebook of shape (ncodes, rounds, channels)
    sigma : tuple(float,float)
        Tuple of values corresponding to the standard deviation
        of the Gaussian-shaped PSF.
    b : float
        Background offset parameter. Can be a constant or the same shape as Y.
        Must be positive. Default: 1e-8
    scale : float
        We can deconvolve the image data to a higher/lower spatial resolution;
        scale determines the scale factor. Default: 1.0 (no rescaling)

    Example
    ----------
        model = ISTDeco(Y, D, sigma, b=1e-8, scale=1)
        model = model.to('cuda') # If we want GPU
        X, Q, loss = model.run(niter=100)
    '''
    def __init__(self, Y, D, sigma, b=1e-8, scale=1):

        self.input_shape = Y.shape
        self.sigma = sigma
        self.scale = scale

        m, r, c = D.shape
        _, _, self.h, self.w = Y.shape
        self.hx = int(np.ceil(self.h * scale)) if self.h > 1 else self.h
        self.wx = int(np.ceil(self.w * scale)) if self.w > 1 else self.w

        bh = int(2 * np.ceil(3 * sigma[0] * scale) + 1) if self.h > 1 else 1
        bw = int(2 * np.ceil(3 * sigma[1] * scale) + 1) if self.w > 1 else 1
        self.psf_support_scaled = (bh, bw)

        bh = int(2 * np.ceil(3 * sigma[0]) + 1) if self.h > 1 else 1
        bw = int(2 * np.ceil(3 * sigma[1]) + 1) if self.w > 1 else 1
        self.psf_support = (bh, bw)

        # Set up X
        x_shape = (m, self.hx, self.wx)
        self.X = torch.ones(x_shape).float()

        # Set up Y
        self.Y = torch.tensor(Y).float().flatten(start_dim=0, end_dim=1)

        # Set up b
        self.b = torch.tensor(b).float()
        if self.b.ndim == 4:
            self.b = self.b.flatten(start_dim=0, end_dim=1)

        # Set up codebook
        self.codebook = Codebook(D)

        # Set up spatial blur
        self.psf = PSF((self.h, self.w), sigma, scale)

        # Prepare constant
        ones_channels = torch.ones((r * c, 1, 1))
        ones_space = torch.ones((1, self.h, self.w))
        self.denominator =  self.codebook.matmul_t(ones_channels) * \
                            self.psf.matmul_t(ones_space)

        # Compute Yhat = DXG
        self.Yhat = self.__forward(self.X)

    def to(self, device):
        '''
            Puts tensors on a device. See pytorch doc for more info.
            Useful for moving tensors to cuda.

            Example
            ----------
                model = ISTDECO(Y,D,sigma)
                model.to('cuda')    # Put tensors on GPU
                model.to('cpu')     # Put tensors on CPU    

            Parameters
            ----------
                device : str
                    The device, for instance 'cpu' or 'cuda'

        '''
        self.Y = self.Y.to(device)
        self.Yhat = self.Yhat.to(device)
        self.X = self.X.to(device)
        self.denominator = self.denominator.to(device)
        self.codebook = self.codebook.to(device)
        self.psf = self.psf.to(device)
        self.b = self.b.to(device)
        return self

    def __forward(self, tensor):
        return self.psf.matmul(self.codebook.matmul(tensor)) + self.b

    def __compute_quality(self, tensor):
        # Pool intensities spatially

        tensor_blurr = torch.nn.functional.avg_pool2d(tensor, \
            self.psf_support_scaled,\
            stride=1,\
            divisor_override=1,\
            padding=tuple(t//2 for t in self.psf_support_scaled))

        tensor_blurr2 = self.psf.up_pool(
            torch.nn.functional.relu(self.Y - self.b))

        # Compute quality feature
        Q = tensor_blurr / self.codebook.matmul_t(tensor_blurr2)
        Q[torch.isnan(Q)] = 0
        return Q

    def run(self, niter=100, acc=1.0, suppress_radius=1):
        '''
            Run the optimization
            
            Parameters
            ----------
                niter : int 
                    Number of iterations

                acc : float
                    Factor for accelerating the multiplicative updates.
                    If too large, convergence may be unstable. Usually a
                    value between 1.0 and 1.5 is fine. Default: 1.0
                suppress_radius : int
                    Integer indicating the radius of the non-maxima suppression footprint.
                    Default: 1.

            Outputs
            ---------
                X : numpy array 
                    A multi-channel image of shape (m, sy, sx) where
                    m is the number of barcodes, sy and sx are the scaled height and width.
                    The values in X corresponds to the intensity of different barcodes.
                    For instance, X_kij is the intensity of the barcode with code k, localized
                    at i and j.

                Q : numpy array
                    A multi-channel image of shape (m, sy, sx) where
                    m is the number of barcodes, sy and sx are the scaled height and width.
                    The values in Q are useful for eliminating false-positive detections during
                    post-processing.

                loss : numpy array
                    An (niter,)-shaped array with the loss value at each iteration.
        '''
        loss = torch.zeros((niter, ))
        for i in range(niter):
            loss[i] = torch.sum(self.Yhat) - \
                torch.sum(self.Y*torch.log(self.Yhat+1e-9))
            self.X = self.X * (self.codebook.matmul_t(
                self.psf.matmul_t(self.Y / self.Yhat)) / self.denominator)**acc
            self.Yhat = self.__forward(self.X)
        Q = self.__compute_quality(self.X)
        if suppress_radius is not None:
            mask = self.__nonmaxsuppress(suppress_radius, self.X)
            self.X = self.X * mask
            Q = Q * mask
        return self.X.cpu().numpy(), Q.cpu().numpy(), loss

    def __nonmaxsuppress(self, radius, tensor):
        padd = [radius if self.h > 1 else 0, radius]
        kernel_sz = (2 * radius + 1 if self.h > 1 else 1, 2 * radius + 1)
        mask = torch.nn.functional.max_pool2d(tensor,
                                              kernel_sz,
                                              stride=1,
                                              padding=padd) == tensor
        #ints = torch.nn.functional.avg_pool2d(tensor, kernel_sz, stride=1, padding=padd, divisor_override=1)
        return mask
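# Note on run(): the loop there performs the standard multiplicative (Richardson-Lucy /
# MLEM-style) update for the Poisson objective sum(Yhat) - sum(Y * log(Yhat)), namely
#     X <- X * ( D^T G^T (Y / Yhat) / denom )**acc,   with Yhat = G(D X) + b,
# where D is the codebook, G the spatial PSF blur, and denom = D^T(1) * G^T(1) is
# precomputed in __init__ as self.denominator. This reading is inferred from the code
# above, not from separate project documentation.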
Example #15
class SOM(object):

    def __init__(self,
                 data,
                 neighborhood,
                 normalizer=None,
                 mapsize=None,
                 mask=None,
                 mapshape='planar',
                 lattice='rect',
                 initialization='pca',
                 training='batch',
                 name='sompy'):
        """
        Self Organizing Map

        :param data: data to be clustered, represented as a matrix of n rows, as inputs and m cols as input features
        :param neighborhood: neighborhood object calculator.
        :param normalizer: normalizer object calculator.
        :param mapsize: tuple/list defining the dimensions of the som. If single number is provided is considered as the number of nodes.
        :param mask: mask
        :param mapshape: shape of the som.
        :param lattice: type of lattice.
        :param initialization: method to be used for initialization of the som.
        :param name: name used to identify the som
        :param training: Training mode (seq, batch)
        """
        self._data = normalizer.normalize(data) if normalizer else data
        self._normalizer = normalizer
        self._dim = data.shape[1]
        self._dlen = data.shape[0]
        self._dlabel = None
        self._bmu = None

        self.name = name
        self.data_raw = data
        self.neighborhood = neighborhood
        self.mapshape = mapshape
        self.initialization = initialization
        self.mask = mask or np.ones([1, self._dim])
        self.codebook = Codebook(mapsize, lattice)
        self.training = training

        self._component_names = self.build_component_names()
        self._distance_matrix = self.calculate_map_dist()
        #self.set_data_labels()  # slow for large data sets

    @property
    def component_names(self):
        return self._component_names

    @component_names.setter
    def component_names(self, compnames):
        if self._dim == len(compnames):
            self._component_names = np.asarray(compnames)[np.newaxis, :]
        else:
            raise ComponentNamesError('Component names should have the same size as the data dimension/features')

    def build_component_names(self):
        cc = ['Variable-' + str(i+1) for i in range(0, self._dim)]
        return np.asarray(cc)[np.newaxis, :]

    @property
    def data_labels(self):
        return self._dlabel

    @data_labels.setter
    def data_labels(self, labels):
        """
        Set labels of the training data, it should be in the format of a list of strings
        """
        if labels.shape == (1, self._dlen):
            label = labels.T
        elif labels.shape == (self._dlen, 1):
            label = labels
        elif labels.shape == (self._dlen,):
            label = labels[:, np.newaxis]
        else:
            raise LabelsError('wrong label format')

        self._dlabel = label

    def build_data_labels(self):
        cc = ['dlabel-' + str(i) for i in range(0, self._dlen)]
        return np.asarray(cc)[:, np.newaxis]

    def calculate_map_dist(self):
        """
        Calculates the grid distance, which will be used during the training steps.
        It supports only planar grids for the moment
        """
        nnodes = self.codebook.nnodes

        distance_matrix = np.zeros((nnodes, nnodes))
        for i in range(nnodes):
            distance_matrix[i] = self.codebook.grid_dist(i).reshape(1, nnodes)

        return distance_matrix

    @timeit()
    def train(self, n_job=1, shared_memory=False, verbose='info'):
        """
        Trains the som

        :param n_job: number of jobs to use to parallelize the training
        :param shared_memory: flag to active shared memory
        :param verbose: verbosity, could be 'debug', 'info' or None
        """
        logging.root.setLevel(getattr(logging, verbose.upper()) if verbose else logging.ERROR)

        logging.info(" Training...")
        logging.debug((
            "--------------------------------------------------------------\n"
            " details: \n"
            "      > data len is {data_len} and data dimension is {data_dim} \n"
            "      > map size is {mpsz0},{mpsz1}\n"
            "      > array size in log10 scale is {array_size}\n"
            "      > number of jobs in parallel: {n_job}\n"
            " --------------------------------------------------------------\n")
            .format(data_len=self._dlen,
                    data_dim=self._dim,
                    mpsz0=self.codebook.mapsize[0],
                    mpsz1=self.codebook.mapsize[1],
                    array_size=np.log10(self._dlen*self.codebook.nnodes*self._dim),
                    n_job=n_job))

        if self.initialization == 'random':
            self.codebook.random_initialization(self._data)

        elif self.initialization == 'pca':
            self.codebook.pca_linear_initialization(self._data)

        self.rough_train(njob=n_job, shared_memory=shared_memory)
        self.finetune_train(njob=n_job, shared_memory=shared_memory)

        logging.debug(" --------------------------------------------------------------")
        logging.info(" Final quantization error: %f" % np.mean(self._bmu[1]))

    def _calculate_ms_and_mpd(self):
        mn = np.min(self.codebook.mapsize)
        max_s = max(self.codebook.mapsize[0], self.codebook.mapsize[1])

        mpd = float(self.codebook.nnodes*10)/float(self._dlen) if mn == 1 else float(self.codebook.nnodes)/float(self._dlen)
        ms = max_s/2.0 if mn == 1 else max_s

        return ms, mpd

    def rough_train(self, njob=1, shared_memory=False):
        logging.info(" Rough training...")

        ms, mpd = self._calculate_ms_and_mpd()

        trainlen, radiusin, radiusfin = int(np.ceil(30*mpd)), None, None

        if self.initialization == 'random':
            radiusin = max(1, np.ceil(ms/3.))
            radiusfin = max(1, radiusin/6.)

        elif self.initialization == 'pca':
            radiusin = max(1, np.ceil(ms/8.))
            radiusfin = max(1, radiusin/4.)

        self._batchtrain(trainlen, radiusin, radiusfin, njob, shared_memory)

    def finetune_train(self, njob=1, shared_memory=False):
        logging.info(" Finetune training...")

        ms, mpd = self._calculate_ms_and_mpd()

        trainlen, radiusin, radiusfin = None, None, None

        if self.initialization == 'random':
            trainlen = int(np.ceil(50*mpd))
            radiusin = max(1, ms/12.)  # from radius fin in rough training
            radiusfin = max(1, radiusin/25.)

        elif self.initialization == 'pca':
            trainlen = int(np.ceil(40*mpd))
            radiusin = max(1, np.ceil(ms/8.)/4)
            radiusfin = 1  # max(1, ms/128)

        self._batchtrain(trainlen, radiusin, radiusfin, njob, shared_memory)

    def _batchtrain(self, trainlen, radiusin, radiusfin, njob=1, shared_memory=False):
        radius = np.linspace(radiusin, radiusfin, trainlen)

        if shared_memory:
            data = self._data
            data_folder = tempfile.mkdtemp()
            data_name = os.path.join(data_folder, 'data')
            dump(data, data_name)
            data = load(data_name, mmap_mode='r')

        else:
            data = self._data

        bmu = None

        # X2 is part of the Euclidean distance (x-y)^2 = x^2 + y^2 - 2xy that we use for each data row in BMU finding.
        # Since it is a fixed value we can skip it during BMU finding for each data point,
        # but later we need it to calculate the quantization error
        fixed_euclidean_x2 = np.einsum('ij,ij->i', data, data)

        logging.info(" radius_ini: %f , radius_final: %f, trainlen: %d\n" % (radiusin, radiusfin, trainlen))

        for i in range(trainlen):
            t1 = time()
            neighborhood = self.neighborhood.calculate(self._distance_matrix, radius[i], self.codebook.nnodes)

            bmu = self.find_bmu(data, njb=njob)
            self.codebook.matrix = self.update_codebook_voronoi(data, bmu, neighborhood)

            qerror = (i+1, round(time() - t1, 3), np.mean(np.sqrt(bmu[1] + fixed_euclidean_x2)))
            logging.info(" epoch: %d ---> elapsed time:  %f, quantization error: %f\n" % qerror)

        bmu[1] = np.sqrt(bmu[1] + fixed_euclidean_x2)
        self._bmu = bmu

    @timeit(logging.DEBUG)
    def find_bmu(self, input_matrix, njb=1):
        """
        Finds the best matching unit (BMU) for each input row of the input matrix. It does them all at once, parallelizing
        the calculation instead of going through each input and running it against the codebook.

        :param input_matrix: numpy matrix representing inputs as rows and features/dimension as cols
        :param njb: number of jobs to parallelize the search
        :returns: the best matching unit for each input
        """
        dlen = input_matrix.shape[0]
        y2 = np.einsum('ij,ij->i', self.codebook.matrix, self.codebook.matrix)

        parallelizer = Parallel(n_jobs=njb, pre_dispatch='3*n_jobs')
        chunk_bmu_finder = delayed(_chunk_based_bmu_find)

        row_chunk = lambda part: part * dlen // njb
        col_chunk = lambda part: min((part+1)*dlen // njb, dlen)

        b = parallelizer(chunk_bmu_finder(input_matrix[row_chunk(i):col_chunk(i)], self.codebook.matrix, y2) for i in xrange(njb))
        bmu = np.asarray(list(itertools.chain(*b))).T

        del b
        return bmu

    @timeit(logging.DEBUG)
    def update_codebook_voronoi(self, training_data, bmu, neighborhood):
        """
        Updates the weights of each node in the codebook that belongs to the bmu's neighborhood.

        First finds the Voronoi set of each node, which only requires computing a smaller matrix.
        This is much faster than the classic batch training algorithm; it is based on the algorithm implemented in
        the SOM Toolbox for Matlab by Helsinki University.

        :param training_data: input matrix with input vectors as rows and vector features as cols
        :param bmu: best matching unit for each input data. Has shape of (2, dlen) where first row has bmu indexes
        :param neighborhood: matrix representing the neighborhood of each bmu

        :returns: An updated codebook that incorporates the learnings from the input data
        """
        row = bmu[0].astype(int)
        col = np.arange(self._dlen)
        val = np.tile(1, self._dlen)
        P = csr_matrix((val, (row, col)), shape=(self.codebook.nnodes, self._dlen))
        S = P.dot(training_data)

        # neighborhood has nnodes*nnodes and S has nnodes*dim  ---> Nominator has nnodes*dim
        nom = neighborhood.T.dot(S)
        nV = P.sum(axis=1).reshape(1, self.codebook.nnodes)
        denom = nV.dot(neighborhood.T).reshape(self.codebook.nnodes, 1)
        new_codebook = np.divide(nom, denom)

        return np.around(new_codebook, decimals=6)

    def project_data(self, data):
        """
        Projects a data set onto a trained SOM. It is based on the nearest-neighbor search module of scikit-learn,
        but it is not that fast.
        """
        clf = neighbors.KNeighborsClassifier(n_neighbors=1)
        labels = np.arange(0, self.codebook.matrix.shape[0])
        clf.fit(self.codebook.matrix, labels)

        # The codebook values are all normalized
        # we can normalize the input data based on mean and std of original data
        data = self._normalizer.normalize_by(self.data_raw, data)
        #data = normalize(data, method='var')
        #plt.hist(data[:,2])

        return clf.predict(data)

    def predict_by(self, data, target, k=5, wt='distance'):
        # here it is assumed that target is the last column in the codebook
        # and data has dim-1 columns
        dim = self.codebook.matrix.shape[1]
        ind = np.arange(0, dim)
        indX = ind[ind != target]
        x = self.codebook.matrix[:, indX]
        y = self.codebook.matrix[:, target]
        n_neighbors = k
        clf = neighbors.KNeighborsRegressor(n_neighbors, weights=wt)
        clf.fit(x, y)

        # The codebook values are all normalized
        # we can normalize the input data based on mean and std of original data
        dimdata = data.shape[1]

        if dimdata == dim:
            data[:, target] = 0
            data = self._normalizer.normalize_by(self.data_raw, data)
            data = data[:, indX]

        elif dimdata == dim-1:
            data = self._normalizer.normalize_by(self.data_raw[:, indX], data)

        predicted_values = clf.predict(data)
        predicted_values = self._normalizer.denormalize_by(self.data_raw[:, target], predicted_values)
        return predicted_values

    def predict(self, x_test, k=5, wt='distance'):
        """
        Similar to scikit-learn, we assume that we have X_tr, Y_tr and X_test. Here it is assumed that target is the last
        column in the codebook and data has dim-1 columns

        :param x_test: input vector
        :param k: number of neighbors to use
        :param wt: method to use for the weights (more detail in KNeighborsRegressor docs)
        :returns: predicted values for the input data
        """
        target = self.data_raw.shape[1]-1
        x_train = self.codebook.matrix[:, :target]
        y_train = self.codebook.matrix[:, target]
        clf = neighbors.KNeighborsRegressor(k, weights=wt)
        clf.fit(x_train, y_train)

        # The codebook values are all normalized
        # we can normalize the input data based on mean and std of original data
        x_test = self._normalizer.normalize_by(self.data_raw[:, :target], x_test)
        predicted_values = clf.predict(x_test)

        return self._normalizer.denormalize_by(self.data_raw[:, target], predicted_values)
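Illustrative only (not part of the class): the same "target is the last codebook column" convention shown with a plain KNeighborsRegressor over a hypothetical 4-node, 3-feature codebook.

import numpy as np
from sklearn import neighbors

toy_codebook = np.array([[0.1, 0.2, 1.0],
                         [0.8, 0.9, 0.0],
                         [0.2, 0.1, 1.0],
                         [0.9, 0.8, 0.0]])
x_train, y_train = toy_codebook[:, :-1], toy_codebook[:, -1]
knn = neighbors.KNeighborsRegressor(n_neighbors=2, weights='distance')
knn.fit(x_train, y_train)
print(knn.predict([[0.15, 0.15]]))      # close to the nodes whose target is 1.0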

    def find_k_nodes(self, data, k=5):
        from sklearn.neighbors import NearestNeighbors
        # we find the k most similar nodes to the input vector
        neighbor = NearestNeighbors(n_neighbors=k)
        neighbor.fit(self.codebook.matrix)

        # The codebook values are all normalized
        # we can normalize the input data based on mean and std of original data
        return neighbor.kneighbors(self._normalizer.normalize_by(self.data_raw, data))

    def bmu_ind_to_xy(self, bmu_ind):
        """
        Translates a best matching unit index to the corresponding matrix x,y coordinates

        :param bmu_ind: node index of the best matching unit (number of node from top left node)
        :returns: corresponding (x,y) coordinate
        """
        rows = self.codebook.mapsize[0]
        cols = self.codebook.mapsize[1]

        # bmu should be an integer between 0 to no_nodes
        out = np.zeros((bmu_ind.shape[0], 3))
        out[:, 2] = bmu_ind
        # the original double assignment to out[:, 0] left the first expression dead; keep the effective one
        out[:, 0] = bmu_ind // cols
        out[:, 1] = bmu_ind % cols

        return out.astype(int)
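A quick check of the index-to-coordinate convention, assuming a hypothetical 3x4 map: node 5 sits on row 5 // 4 = 1 and column 5 % 4 = 1.

import numpy as np

bmu_ind = np.array([0, 5, 11])
cols = 4
print(bmu_ind // cols, bmu_ind % cols)   # rows [0 1 2], columns [0 1 3]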

    def cluster(self, n_clusters=8):
        import sklearn.cluster as clust
        return clust.KMeans(n_clusters=n_clusters).fit_predict(self._normalizer.denormalize_by(self.data_raw,
                                                                                               self.codebook.matrix))

    def predict_probability(self, data, target, k=5):
        """
        Predicts probability of the input data to be target

        :param data: data to predict; it is assumed that 'target' is the last column in the codebook,
                     so data should have dim-1 columns
        :param target: target column whose probability is predicted
        :param k: k parameter of KNeighborsRegressor
        :returns: probability of the data being the target
        """
        dim = self.codebook.matrix.shape[1]
        ind = np.arange(0, dim)
        indx = ind[ind != target]
        x = self.codebook.matrix[:, indx]
        y = self.codebook.matrix[:, target]

        clf = neighbors.KNeighborsRegressor(k, weights='distance')
        clf.fit(x, y)

        # The codebook values are all normalized
        # we can normalize the input data based on mean and std of original data
        dimdata = data.shape[1]

        if dimdata == dim: 
            data[:, target] = 0
            data = self._normalizer.normalize_by(self.data_raw, data)
            data = data[:, indx]

        elif dimdata == dim-1:
            data = self._normalizer.normalize_by(self.data_raw[:, indx], data)

        weights, ind = clf.kneighbors(data, n_neighbors=k, return_distance=True)
        weights = 1./weights
        sum_ = np.sum(weights, axis=1)
        weights = weights/sum_[:, np.newaxis]
        labels = np.sign(self.codebook.matrix[ind, target])
        labels[labels >= 0] = 1

        # for positives
        pos_prob = labels.copy()
        pos_prob[pos_prob < 0] = 0
        pos_prob *= weights
        pos_prob = np.sum(pos_prob, axis=1)[:, np.newaxis]

        # for negatives
        neg_prob = labels.copy()
        neg_prob[neg_prob > 0] = 0
        neg_prob = neg_prob * weights * -1
        neg_prob = np.sum(neg_prob, axis=1)[:, np.newaxis]

        #predicted_values = clf.predict(data)
        #predicted_values = denormalize_by(data_raw[:,Target], predicted_values)
        return np.concatenate((pos_prob, neg_prob), axis=1)
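Illustrative only: how the inverse-distance weights and the sign of the neighbours' target column combine into (positive, negative) probabilities for a single query, with made-up distances and labels.

import numpy as np

dists = np.array([[0.5, 1.0, 2.0]])            # distances to the 3 nearest nodes
w = 1. / dists
w = w / np.sum(w, axis=1)[:, np.newaxis]       # normalized weights: [0.571, 0.286, 0.143]
labels = np.array([[1., 1., -1.]])             # sign of the neighbours' target values
pos_prob = np.sum(np.where(labels > 0, w, 0.), axis=1)
neg_prob = np.sum(np.where(labels < 0, w, 0.), axis=1)
print(pos_prob, neg_prob)                      # approx. [0.857] and [0.143]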

    def node_activation(self, data, target=None, wt='distance'):
        weights, ind = None, None

        if not target:
            clf = neighbors.KNeighborsClassifier(n_neighbors=self.codebook.nnodes)
            labels = np.arange(0, self.codebook.matrix.shape[0])
            clf.fit(self.codebook.matrix, labels)

            # The codebook values are all normalized
            # we can normalize the input data based on mean and std of original data
            data = self._normalizer.normalize_by(self.data_raw, data)
            weights, ind = clf.kneighbors(data)

            # Inverse-distance weights (the softmax variant below is commented out)
            weights = 1./weights
            #S_  = np.sum(np.exp(weights),axis=1)[:,np.newaxis]
            #weights = np.exp(weights)/S_

        return weights, ind
Example #16
    def codebook(self):
        """ Export a text file code book of categories and codes.
        """

        Codebook(self.app, self.ui.textEdit)
Example #17
File: sompy.py  Project: dongzhiming/SMEA
    def __init__(self,
                 data,
                 neighborhood,
                 normalizer=None,
                 mapsize=None,
                 mask=None,
                 mapshape='planar',
                 lattice='rect',
                 initialization='pca',
                 training='batch',
                 name='sompy'):
        """
        Self Organizing Map

        :param data: data to be clustered, represented as a matrix with n rows
            as inputs and m columns as input features
        :param neighborhood: neighborhood calculator object.
        :param normalizer: normalizer object.
        :param mapsize: tuple/list defining the dimensions of the som. If a
            single number is provided, it is considered the number of nodes.
        :param mask: mask
        :param mapshape: shape of the som.
        :param lattice: type of lattice.
        :param initialization: method to be used for initialization of the som.
        :param name: name used to identify the som
        :param training: Training mode (seq, batch)
        """
        print "DDDD", data
        self._data = data  # normalizer.normalize(data) if normalizer else data
        print "data  : \n", self._data
        print "map size  : ", mapsize

        self._normalizer = normalizer
        print "_normalizer is a object to class normalizer : ", self._normalizer
        self._dim = data.shape[1]
        print "data shape[1]  :  ", self._dim
        self._dlen = data.shape[0]
        print "data shape[0] : ", data.shape[0]
        self._dlabel = None
        print "data label  : ", self._dlabel
        self._bmu = None
        print "intial bmu  :  ", self._bmu
        self.name = name
        print "name   :  ", self.name
        self.data_raw = data
        print "data_raw  : ", self.data_raw
        #print "data_raw  : ", self.data_raw
        self.neighborhood = neighborhood
        self.mapshape = mapshape
        print "mapshape : ", self.mapshape
        self.initialization = initialization
        print "initilization : ", self.initialization
        self.mask = mask or np.ones([1, self._dim])
        print "mask  : ", self.mask
        self.codebook = Codebook(mapsize, lattice)
        print "--------codebook is a object to class Codebook---------\n", self.codebook
        self.training = training
        print "training mode  : ", self.training

        self._component_names = self.build_component_names()
        print "Initial component name  : ", self._component_names
        self._distance_matrix = self.calculate_map_dist()
        print "initial distance matrix  : ", self._distance_matrix
Example #18
class SOM(object):

    def __init__(self,
                 data,
                 neighborhood,
                 normalizer=None,
                 mapsize=None,
                 mask=None,
                 mapshape='planar',
                 lattice='rect',
                 initialization='pca',
                 training='batch',
                 name='sompy',
                 component_names=None):
        """
        Self Organizing Map

        :param data: data to be clustered, represented as a matrix with n rows
            as inputs and m columns as input features
        :param neighborhood: neighborhood calculator object.
        :param normalizer: normalizer object.
        :param mapsize: tuple/list defining the dimensions of the som. If a
            single number is provided, it is considered the number of nodes.
        :param mask: mask
        :param mapshape: shape of the som.
        :param lattice: type of lattice.
        :param initialization: method to be used for initialization of the som.
        :param name: name used to identify the som
        :param training: Training mode (seq, batch)
        """
        self._data = normalizer.normalize(data) if normalizer else data
        self._normalizer = normalizer
        self._dim = data.shape[1]
        self._dlen = data.shape[0]
        self._dlabel = None
        self._bmu = None

        self.name = name
        self.data_raw = data
        self.neighborhood = neighborhood
        self.mapshape = mapshape
        self.initialization = initialization
        self.mask = mask or np.ones([1, self._dim])
        mapsize = self.calculate_map_size(lattice) if not mapsize else mapsize
        self.codebook = Codebook(mapsize, lattice)
        self.training = training
        self._component_names = self.build_component_names() if component_names is None else [component_names]
        self._distance_matrix = self.calculate_map_dist()

    @property
    def component_names(self):
        return self._component_names

    @component_names.setter
    def component_names(self, compnames):
        if self._dim == len(compnames):
            self._component_names = np.asarray(compnames)[np.newaxis, :]
        else:
            raise ComponentNamesError('Component names should have the same '
                                      'size as the data dimension/features')

    def build_component_names(self):
        cc = ['Variable-' + str(i+1) for i in range(0, self._dim)]
        return np.asarray(cc)[np.newaxis, :]

    @property
    def data_labels(self):
        return self._dlabel

    @data_labels.setter
    def data_labels(self, labels):
        """
        Set the labels of the training data; they should be provided as a list
        of strings.
        """
        if labels.shape == (1, self._dlen):
            label = labels.T
        elif labels.shape == (self._dlen, 1):
            label = labels
        elif labels.shape == (self._dlen,):
            label = labels[:, np.newaxis]
        else:
            raise LabelsError('wrong label format')

        self._dlabel = label

    def build_data_labels(self):
        cc = ['dlabel-' + str(i) for i in range(0, self._dlen)]
        return np.asarray(cc)[:, np.newaxis]

    def calculate_map_dist(self):
        """
        Calculates the grid distance, which will be used during the training
        steps. It supports only planar grids for the moment
        """
        nnodes = self.codebook.nnodes

        distance_matrix = np.zeros((nnodes, nnodes))
        for i in range(nnodes):
            distance_matrix[i] = self.codebook.grid_dist(i).reshape(1, nnodes)

        return distance_matrix

    @timeit()
    def train(self,
              n_job=1,
              shared_memory=False,
              verbose='info',
              train_rough_len=None,
              train_rough_radiusin=None,
              train_rough_radiusfin=None,
              train_finetune_len=None,
              train_finetune_radiusin=None,
              train_finetune_radiusfin=None,
              nth=1):
        """
        Trains the som

        :param n_job: number of jobs used to parallelize the training
        :param shared_memory: flag to activate shared memory
        :param verbose: verbosity, could be 'debug', 'info' or None
        """
        logging.root.setLevel(
            getattr(logging, verbose.upper()) if verbose else logging.ERROR)

        logging.info(" Training...")
        logging.debug((
            "--------------------------------------------------------------\n"
            " details: \n"
            "      > data len is {data_len} and data dimension is {data_dim}\n"
            "      > map size is {mpsz0},{mpsz1}\n"
            "      > array size in log10 scale is {array_size}\n"
            "      > number of jobs in parallel: {n_job}\n"
            " -------------------------------------------------------------\n")
            .format(data_len=self._dlen,
                    data_dim=self._dim,
                    mpsz0=self.codebook.mapsize[0],
                    mpsz1=self.codebook.mapsize[1],
                    array_size=np.log10(
                        self._dlen * self.codebook.nnodes * self._dim),
                    n_job=n_job))

        if self.initialization == 'random':
            self.codebook.random_initialization(self._data)

        elif self.initialization == 'pca':
            self.codebook.pca_linear_initialization(self._data)

        self.rough_train(njob=n_job, shared_memory=shared_memory, trainlen=train_rough_len,
                         radiusin=train_rough_radiusin, radiusfin=train_rough_radiusfin, nth=nth)
        self.finetune_train(njob=n_job, shared_memory=shared_memory, trainlen=train_finetune_len,
                            radiusin=train_finetune_radiusin, radiusfin=train_finetune_radiusfin, nth=nth)

        logging.debug(
            " --------------------------------------------------------------")
        logging.info(" Final quantization error: %f" % np.mean(self._bmu[1]))

    def _calculate_ms_and_mpd(self):
        mn = np.min(self.codebook.mapsize)
        max_s = max(self.codebook.mapsize[0], self.codebook.mapsize[1])

        if mn == 1:
            mpd = float(self.codebook.nnodes*10)/float(self._dlen)
        else:
            mpd = float(self.codebook.nnodes)/float(self._dlen)
        ms = max_s/2.0 if mn == 1 else max_s

        return ms, mpd

    def rough_train(self, njob=1, shared_memory=False, trainlen=None, radiusin=None, radiusfin=None,nth=1):
        logging.info(" Rough training...")

        ms, mpd = self._calculate_ms_and_mpd()

        trainlen = int(np.ceil(30*mpd)) if not trainlen else trainlen

        if self.initialization == 'random':
            radiusin = max(1, np.ceil(ms/3.)) if not radiusin else radiusin
            radiusfin = max(1, radiusin/6.) if not radiusfin else radiusfin

        elif self.initialization == 'pca':
            radiusin = max(1, np.ceil(ms/8.)) if not radiusin else radiusin
            radiusfin = max(1, radiusin/4.) if not radiusfin else radiusfin

        self._batchtrain(trainlen, radiusin, radiusfin, njob, shared_memory, nth)
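A worked example of the default rough-training schedule, assuming a hypothetical 20x15 map (300 nodes), 1000 samples and PCA initialization: mpd = 300/1000 = 0.3, so the rough phase runs ceil(30*0.3) = 9 epochs, shrinking the radius from ceil(20/8) = 3 to max(1, 3/4) = 1.

import numpy as np

ms, mpd = 20, 300.0 / 1000
trainlen = int(np.ceil(30 * mpd))              # 9 epochs
radiusin = max(1, np.ceil(ms / 8.))            # 3.0
radiusfin = max(1, radiusin / 4.)              # 1.0
print(trainlen, radiusin, radiusfin)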

    def finetune_train(self, njob=1, shared_memory=False, trainlen=None, radiusin=None, radiusfin=None,nth=1):
        logging.info(" Finetune training...")

        ms, mpd = self._calculate_ms_and_mpd()

        if self.initialization == 'random':
            trainlen = int(np.ceil(50*mpd)) if not trainlen else trainlen
            radiusin = max(1, ms/12.)  if not radiusin else radiusin # from radius fin in rough training
            radiusfin = max(1, radiusin/25.) if not radiusfin else radiusfin

        elif self.initialization == 'pca':
            trainlen = int(np.ceil(40*mpd)) if not trainlen else trainlen
            radiusin = max(1, np.ceil(ms/8.)/4) if not radiusin else radiusin
            radiusfin = 1 if not radiusfin else radiusfin # max(1, ms/128)

        self._batchtrain(trainlen, radiusin, radiusfin, njob, shared_memory, nth)

    def _batchtrain(self, trainlen, radiusin, radiusfin, njob=1,
                    shared_memory=False, nth=1):
        radius = np.linspace(radiusin, radiusfin, trainlen)

        if shared_memory:
            data = self._data
            data_folder = tempfile.mkdtemp()
            data_name = os.path.join(data_folder, 'data')
            dump(data, data_name)
            data = load(data_name, mmap_mode='r')

        else:
            data = self._data

        bmu = None

        # X2 is part of euclidean distance (x-y)^2 = x^2 +y^2 - 2xy that we use
        # for each data row in bmu finding.
        # Since it is a fixed value we can skip it during bmu finding for each
        # data point, but later we need it to calculate the quantization error
        fixed_euclidean_x2 = np.einsum('ij,ij->i', data, data)

        logging.info(" radius_ini: %f , radius_final: %f, trainlen: %d\n" %
                     (radiusin, radiusfin, trainlen))

        for i in range(trainlen):
            t1 = time()
            neighborhood = self.neighborhood.calculate(
                self._distance_matrix, radius[i], self.codebook.nnodes)

            bmu = self.find_bmu(data, njb=njob, nth=nth)
            self.codebook.matrix = self.update_codebook_voronoi(data, bmu,
                                                                neighborhood)

            qerror = (i + 1, round(time() - t1, 3),
                      np.mean(np.sqrt(bmu[1] + fixed_euclidean_x2)))
            logging.info(
                " epoch: %d ---> elapsed time:  %f, quantization error: %f\n" %
                qerror)

        bmu[1] = np.sqrt(bmu[1] + fixed_euclidean_x2)
        self._bmu = bmu
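A sanity check of the ||x - y||^2 = x.x + y.y - 2 x.y identity that the fixed_euclidean_x2 term above relies on (illustrative only).

import numpy as np

x = np.array([1.0, 2.0, 3.0])
y = np.array([0.5, 0.0, 4.0])
lhs = np.sum((x - y) ** 2)
rhs = x.dot(x) + y.dot(y) - 2 * x.dot(y)
print(np.isclose(lhs, rhs))                    # True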


    @timeit(logging.DEBUG)
    def find_bmu(self, input_matrix, njb=1, nth=1):
        """
        Finds the best matching unit (bmu) for each input data from the input
        matrix. It does all at once parallelizing the calculation instead of
        going through each input and running it against the codebook.

        :param input_matrix: numpy matrix representing inputs as rows and
            features/dimension as cols
        :param njb: number of jobs to parallelize the search
        :returns: the best matching unit for each input
        """
        dlen = input_matrix.shape[0]
        y2 = np.einsum('ij,ij->i', self.codebook.matrix, self.codebook.matrix)
        if njb == -1:
            njb = cpu_count()

        pool = Pool(njb)
        chunk_bmu_finder = _chunk_based_bmu_find

        def row_chunk(part):
            return part * dlen // njb

        def col_chunk(part):
            return min((part+1)*dlen // njb, dlen)

        chunks = [input_matrix[row_chunk(i):col_chunk(i)] for i in range(njb)]
        b = pool.map(lambda chk: chunk_bmu_finder(chk, self.codebook.matrix, y2, nth=nth), chunks)
        pool.close()
        pool.join()
        bmu = np.asarray(list(itertools.chain(*b))).T
        del b
        return bmu

    @timeit(logging.DEBUG)
    def update_codebook_voronoi(self, training_data, bmu, neighborhood):
        """
        Updates the weights of each node in the codebook that belongs to the
        bmu's neighborhood.

        First finds the Voronoi set of each node, which only requires
        computing a smaller matrix.
        Much faster than the classic batch training algorithm; it is based
        on the corresponding algorithm in the SOM Toolbox for Matlab by
        Helsinki University.

        :param training_data: input matrix with input vectors as rows and
            vector features as cols
        :param bmu: best matching unit for each input data. Has shape of
            (2, dlen) where first row has bmu indexes
        :param neighborhood: matrix representing the neighborhood of each bmu

        :returns: An updated codebook that incorporates the learnings from the
            input data
        """
        row = bmu[0].astype(int)
        col = np.arange(self._dlen)
        val = np.tile(1, self._dlen)
        P = csr_matrix((val, (row, col)), shape=(self.codebook.nnodes,
                       self._dlen))
        S = P.dot(training_data)

        # neighborhood has nnodes*nnodes and S has nnodes*dim
        # ---> numerator has nnodes*dim
        nom = neighborhood.T.dot(S)
        nV = P.sum(axis=1).reshape(1, self.codebook.nnodes)
        denom = nV.dot(neighborhood.T).reshape(self.codebook.nnodes, 1)
        new_codebook = np.divide(nom, denom)

        return np.around(new_codebook, decimals=6)

    def project_data(self, data):
        """
        Projects a data set onto a trained SOM. It is based on the
        nearest-neighbor search module of scikit-learn, but it is not that fast.
        """
        clf = neighbors.KNeighborsClassifier(n_neighbors=1)
        labels = np.arange(0, self.codebook.matrix.shape[0])
        clf.fit(self.codebook.matrix, labels)

        # The codebook values are all normalized
        # we can normalize the input data based on mean and std of
        # original data
        data = self._normalizer.normalize_by(self.data_raw, data)

        return clf.predict(data)

    def predict_by(self, data, target, k=5, wt='distance'):
        # here it is assumed that target is the last column in the codebook
        # and data has dim-1 columns
        dim = self.codebook.matrix.shape[1]
        ind = np.arange(0, dim)
        indX = ind[ind != target]
        x = self.codebook.matrix[:, indX]
        y = self.codebook.matrix[:, target]
        n_neighbors = k
        clf = neighbors.KNeighborsRegressor(n_neighbors, weights=wt)
        clf.fit(x, y)

        # The codebook values are all normalized
        # we can normalize the input data based on mean and std of
        # original data
        dimdata = data.shape[1]

        if dimdata == dim:
            data[:, target] = 0
            data = self._normalizer.normalize_by(self.data_raw, data)
            data = data[:, indX]

        elif dimdata == dim-1:
            data = self._normalizer.normalize_by(self.data_raw[:, indX], data)

        predicted_values = clf.predict(data)
        predicted_values = self._normalizer.denormalize_by(
            self.data_raw[:, target], predicted_values)
        return predicted_values

    def predict(self, x_test, k=5, wt='distance'):
        """
        As in scikit-learn, we assume we have X_train, y_train and X_test. It
        is assumed that the target is the last column in the codebook and that
        the data has dim-1 columns.

        :param x_test: input vector
        :param k: number of neighbors to use
        :param wt: method to use for the weights
            (more detail in KNeighborsRegressor docs)
        :returns: predicted values for the input data
        """
        target = self.data_raw.shape[1]-1
        x_train = self.codebook.matrix[:, :target]
        y_train = self.codebook.matrix[:, target]
        clf = neighbors.KNeighborsRegressor(k, weights=wt)
        clf.fit(x_train, y_train)

        # The codebook values are all normalized
        # we can normalize the input data based on mean and std of
        # original data
        x_test = self._normalizer.normalize_by(
            self.data_raw[:, :target], x_test)
        predicted_values = clf.predict(x_test)

        return self._normalizer.denormalize_by(
            self.data_raw[:, target], predicted_values)

    def find_k_nodes(self, data, k=5):
        from sklearn.neighbors import NearestNeighbors
        # we find the k most similar nodes to the input vector
        neighbor = NearestNeighbors(n_neighbors=k)
        neighbor.fit(self.codebook.matrix)

        # The codebook values are all normalized
        # we can normalize the input data based on mean and std of
        # original data
        return neighbor.kneighbors(
            self._normalizer.normalize_by(self.data_raw, data))

    def bmu_ind_to_xy(self, bmu_ind):
        """
        Translates a best matching unit index to the corresponding
        matrix x,y coordinates.

        :param bmu_ind: node index of the best matching unit
            (number of node from top left node)
        :returns: corresponding (x,y) coordinate
        """
        rows = self.codebook.mapsize[0]
        cols = self.codebook.mapsize[1]

        # bmu should be an integer between 0 to no_nodes
        out = np.zeros((bmu_ind.shape[0], 3))
        out[:, 2] = bmu_ind
        # the original double assignment to out[:, 0] left the first expression dead; keep the effective one
        out[:, 0] = bmu_ind // cols
        out[:, 1] = bmu_ind % cols

        return out.astype(int)

    def cluster(self, n_clusters=8):
        import sklearn.cluster as clust
        cl_labels = clust.KMeans(n_clusters=n_clusters).fit_predict(
            self._normalizer.denormalize_by(self.data_raw,
                                            self.codebook.matrix))
        self.cluster_labels = cl_labels
        return cl_labels

    def predict_probability(self, data, target, k=5):
        """
        Predicts probability of the input data to be target

        :param data: data to predict; it is assumed that 'target' is the last
            column in the codebook, so data should have dim-1 columns
        :param target: target column whose probability is predicted
        :param k: k parameter of KNeighborsRegressor
        :returns: probability of the data being the target
        """
        dim = self.codebook.matrix.shape[1]
        ind = np.arange(0, dim)
        indx = ind[ind != target]
        x = self.codebook.matrix[:, indx]
        y = self.codebook.matrix[:, target]

        clf = neighbors.KNeighborsRegressor(k, weights='distance')
        clf.fit(x, y)

        # The codebook values are all normalized
        # we can normalize the input data based on mean and std of
        # original data
        dimdata = data.shape[1]

        if dimdata == dim:
            data[:, target] = 0
            data = self._normalizer.normalize_by(self.data_raw, data)
            data = data[:, indx]

        elif dimdata == dim-1:
            data = self._normalizer.normalize_by(self.data_raw[:, indx], data)

        weights, ind = clf.kneighbors(data, n_neighbors=k,
                                      return_distance=True)
        weights = 1./weights
        sum_ = np.sum(weights, axis=1)
        weights = weights/sum_[:, np.newaxis]
        labels = np.sign(self.codebook.matrix[ind, target])
        labels[labels >= 0] = 1

        # for positives
        pos_prob = labels.copy()
        pos_prob[pos_prob < 0] = 0
        pos_prob *= weights
        pos_prob = np.sum(pos_prob, axis=1)[:, np.newaxis]

        # for negatives
        neg_prob = labels.copy()
        neg_prob[neg_prob > 0] = 0
        neg_prob = neg_prob * weights * -1
        neg_prob = np.sum(neg_prob, axis=1)[:, np.newaxis]

        return np.concatenate((pos_prob, neg_prob), axis=1)

    def node_activation(self, data, target=None, wt='distance'):
        weights, ind = None, None

        if not target:
            clf = neighbors.KNeighborsClassifier(
                n_neighbors=self.codebook.nnodes)
            labels = np.arange(0, self.codebook.matrix.shape[0])
            clf.fit(self.codebook.matrix, labels)

            # The codebook values are all normalized
            # we can normalize the input data based on mean and std of
            # original data
            data = self._normalizer.normalize_by(self.data_raw, data)
            weights, ind = clf.kneighbors(data)

            # Inverse-distance weights
            weights = 1./weights

        return weights, ind

    def calculate_topographic_error(self):
        bmus1 = self.find_bmu(self.data_raw, njb=1, nth=1)
        bmus2 = self.find_bmu(self.data_raw, njb=1, nth=2)
        bmus_gap = np.abs((self.bmu_ind_to_xy(np.array(bmus1[0]))[:, 0:2] - self.bmu_ind_to_xy(np.array(bmus2[0]))[:, 0:2]).sum(axis=1))
        return np.mean(bmus_gap != 1)
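Illustrative only: the topographic error above is the fraction of samples whose first and second BMUs are not grid neighbours (city-block distance different from 1), computed here on made-up coordinates.

import numpy as np

xy1 = np.array([[0, 0], [1, 2], [3, 3]])       # hypothetical 1st-BMU coordinates
xy2 = np.array([[0, 1], [1, 2], [0, 0]])       # hypothetical 2nd-BMU coordinates
gap = np.abs(xy1 - xy2).sum(axis=1)            # [1, 0, 6]
print(np.mean(gap != 1))                       # 2/3 of the samples count as errors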

    def calculate_map_size(self, lattice):
        """
        Calculates the optimal map size for a dataset using eigenvalues and eigenvectors. Ported from Matlab.

        :param lattice: 'rect' or 'hex'
        :return: map sizes
        """
        D = self.data_raw.copy()
        dlen = D.shape[0]
        dim = D.shape[1]
        munits = np.ceil(5 * (dlen ** 0.5))
        A = np.full((dim, dim), np.inf)

        for i in range(dim):
            D[:, i] = D[:, i] - np.mean(D[np.isfinite(D[:, i]), i])

        for i in range(dim):
            for j in range(dim):
                c = D[:, i] * D[:, j]
                c = c[np.isfinite(c)]
                A[i, j] = sum(c) / len(c)
                A[j, i] = A[i, j]

        eigval = sorted(np.linalg.eig(A)[0])
        if eigval[-1] == 0 or eigval[-2] * munits < eigval[-1]:
            ratio = 1
        else:
            ratio = np.sqrt(eigval[-1] / eigval[-2])

        if lattice == "rect":
            size1 = min(munits, round(np.sqrt(munits / ratio)))
        else:
            size1 = min(munits, round(np.sqrt(munits / ratio*np.sqrt(0.75))))

        size2 = round(munits / size1)

        return [int(size1), int(size2)]
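A worked example of the map-size heuristic above, assuming 1000 samples and an eigenvalue ratio of 1: munits = ceil(5*sqrt(1000)) = 159, giving roughly a 13 x 12 rectangular map.

import numpy as np

munits = np.ceil(5 * 1000 ** 0.5)              # 159 map units
size1 = min(munits, round(np.sqrt(munits / 1.0)))
size2 = round(munits / size1)
print(int(size1), int(size2))                  # 13 12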
from codebook import Codebook

codebooks = Codebook.load_codebooks()
Codebook.test_codebooks(codebooks)
Example #20
from codebook import Codebook

codebooks = Codebook.train_codebooks()
Codebook.save_codebooks(codebooks)
Example #21
def build_codebook(encoder, dataset, args):
    embed_bb = args.getboolean('Embedding', 'EMBED_BB')
    codebook = Codebook(encoder, dataset, embed_bb)
    return codebook
Example #22
class SOMMap(object):
    def __init__(self,
                 data,
                 neighborhood,
                 normalizer=None,
                 mapsize=None,
                 mask=None,
                 mapshape='planar',
                 lattice='rect',
                 initialization='pca',
                 training='batch',
                 radius_train='linear',
                 name='sompy',
                 component_names=None,
                 components_to_plot=None,
                 isNormalized=False):
        """
        Self Organizing Map

        :param data: data to be clustered, represented as a matrix with n rows
            as inputs and m columns as input features
        :param neighborhood: neighborhood calculator object.
        :param normalizer: normalizer object.
        :param mapsize: tuple/list defining the dimensions of the som. If a
            single number is provided, it is considered the number of nodes.
        :param mask: mask
        :param mapshape: shape of the som.
        :param lattice: type of lattice.
        :param initialization: method to be used for initialization of the som.
        :param name: name used to identify the som
        :param training: Training mode (seq, batch)
        """

        if data is not None:  # added (LB)
            if normalizer and isNormalized == False:
                for i in range(len(normalizer)):
                    data[:, i] = normalizer[i].normalize(data[:, i])
                self._data = data
            else:
                self._data = data
            self._data = self._data.astype('double')  # added (LB)
            self._dim = data.shape[1]
            self._dlen = data.shape[0]
        else:
            self._data = None
            self._dim = None
            self._dlen = None

        self._normalizer = normalizer
        self._dlabel = None
        self._bmu = None

        self.name = name
        self.data_raw = data
        self.neighborhood = neighborhood
        self.mapshape = mapshape
        self.initialization = initialization
        self.mask = mask
        mapsize = self.calculate_map_size(lattice) if not mapsize else mapsize
        self.mapsize = mapsize
        self.codebook = Codebook(mapsize, lattice)
        self.training = training
        self.radius_train = radius_train
        self._component_names = (self.build_component_names()
                                 if component_names is None
                                 else [component_names])
        self._distance_matrix = self.calculate_map_dist()
        self.components_to_plot = components_to_plot

    def __str__(self):
        return f'mapsize={self.mapsize}\nname={self.name}\nNormaliser={self._normalizer.name}\nMap shape={self.mapshape}'

    def attach_data(self, data, comp_names):  # added (LB)
        self.data_raw = data
        self._dim = data.shape[1]
        self._dlen = data.shape[0]
        self.component_names = comp_names
        self._data = self._normalizer.normalize(data)
        self._data = self._data.astype('double')
        self._bmu = self.find_bmu(data, njb=1)

    def save(self, file):
        dico = {
            'name': self.name,
            'codebook': self.codebook.matrix,
            'lattice': self.codebook.lattice,
            'mapsize': self.codebook.mapsize,
            'normalization': self._normalizer,
            'norm_params': self._normalizer.params,
            'comp_names': self._component_names[0],
            'mask': self.mask,
            'neighborhood': self.neighborhood.name,
            'codebookinitialized': self.codebook.initialized,
            'initialization': self.initialization,
            'bmu': self._bmu,
            'mapshape': self.mapshape,
            'training': self.training,
            'radius_train': self.radius_train,
            'dim': self._dim
        }

        import pickle
        pickle.dump(dico, open(file, 'wb'))
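A hypothetical counterpart to save() above (no matching load method is shown in this snippet): re-open the pickled dictionary and inspect a few of the stored fields; the file name is an assumption.

import pickle

with open('som_state.pkl', 'rb') as f:          # file name is an assumption
    dico = pickle.load(f)
print(dico['name'], dico['mapsize'], dico['comp_names'])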

    #def plotepoch(self,comp0,comp1):
    def plotplanes(self):  # added (LB)
        if self.components_to_plot is None:
            return
        comps = self.components_to_plot

        n = {1: (1, 1), 2: (1, 2), 3: (2, 2), 4: (2, 2)}  # rows, columns
        nplots = len(comps)
        if nplots > 4:
            raise ValueError(
                'The number of components must be at most 4')
        nl = n[nplots][0]
        nc = n[nplots][1]

        neighbours_nodes = self.calculate_neighbours_nodes()
        edges = []
        for i in range(self.codebook.nnodes):
            for j in range(len(neighbours_nodes[i])):
                edges.append((i, neighbours_nodes[i][j]))
        nodes = [i for i in range(self.codebook.nnodes)]
        G = nx.Graph()
        G.add_nodes_from(nodes)
        G.add_edges_from(edges)

        plt.clf()
        for i in range(nplots):
            refs = [self.codebook.matrix[:, comps[i][0]]
                    ], [self.codebook.matrix[:, comps[i][1]]]
            pos = {}
            for k in range(self.codebook.nnodes):
                name = int(k)
                pos[name] = (refs[0][0][k], refs[1][0][k])
            plt.subplot(nl, nc, i + 1)
            plt.scatter(self._data[:, comps[i][0]],
                        self._data[:, comps[i][1]],
                        marker='x',
                        c='b')
            #plt.scatter([self.codebook.matrix[:,comps[i][0]]],[self.codebook.matrix[:,comps[i][1]]],marker='o',c='r')
            nx.draw_networkx(G,
                             pos,
                             arrows=False,
                             with_labels=False,
                             node_size=50)
            plt.xlabel(self._component_names[0][comps[i][0]])
            plt.ylabel(self._component_names[0][comps[i][1]])
            plt.title('comp. {} - {}'.format(comps[i][0], comps[i][1]))
        plt.pause(0.1)

    def plot_tsne(self, init="pca", perplexity=10, verbose=2):
        T_SNE = TSNE(2, init=init, perplexity=perplexity, verbose=verbose)
        x2d = T_SNE.fit_transform(self.codebook.matrix)

        neighbours_nodes = self.calculate_neighbours_nodes()
        edges = []
        for i in range(self.codebook.nnodes):
            for j in range(len(neighbours_nodes[i])):
                edges.append((i, neighbours_nodes[i][j]))
        nodes = [i for i in range(self.codebook.nnodes)]
        G = nx.Graph()
        G.add_nodes_from(nodes)
        for i in range(self.codebook.nnodes):
            G.nodes[i]['label'] = self._nlabel[i]
        G.add_edges_from(edges)

        pos = {}
        for i in range(self.codebook.nnodes):
            name = int(i)
            pos[name] = (x2d[i][0], x2d[i][1])

        colorlist = []
        for i in range(len(np.unique(self._nlabel))):
            colorlist.append('#%06X' % randint(0, 0xFFFFFF))

        plt.figure()
        for i in range(len(np.unique(self._nlabel))):
            nodelist = []
            for j in range(self.codebook.nnodes):
                if G.nodes[j]['label'] == np.unique(self._nlabel)[i]:
                    nodelist.append(j)
            nx.draw_networkx(G,
                             pos,
                             arrows=False,
                             nodelist=nodelist,
                             node_color=colorlist[i],
                             with_labels=False,
                             node_size=50,
                             label=np.unique(self._nlabel)[i])
        plt.xlabel('tsne-2d-one')
        plt.ylabel('tsne-2d-two')
        plt.title('t-SNE of the reference neurons')
        plt.show()

    @property
    def component_names(self):
        return self._component_names

    @component_names.setter
    def component_names(self, compnames):

        if self._dim == len(compnames):
            self._component_names = np.asarray(compnames)[np.newaxis, :]
        else:
            raise ComponentNamesError('Component names should have the same '
                                      'size as the data dimension/features')

    def build_component_names(self):
        return ['Variable-' + str(i + 1) for i in range(0, self._dim)]

    @property
    def node_labels(self):
        return self._nlabel

    @node_labels.setter
    def node_labels(self, labels):
        """
        Set the labels of the map nodes; they should be provided as a list
        of strings.
        """

        if labels.shape == (1, self.codebook.nnodes):
            label = labels.T
        elif labels.shape == (self.codebook.nnodes, 1):
            label = labels
        elif labels.shape == (self.codebook.nnodes, ):
            label = labels[:, np.newaxis]
        else:
            raise LabelsError('wrong label format')

        self._nlabel = label

    def build_node_labels(self):
        return ['nlabel-' + str(i) for i in range(0, self.codebook.nnodes)]

    def node_labels_from_data(self, sData):
        nlabels = []
        for i in range(self.codebook.nnodes):
            ind = get_index_positions(self._bmu[0], i)
            if ind != []:
                subData = [sData._dlabel[k] for k in ind]
                nlabels.append(Counter(subData).most_common(1)[0][0])
            else:
                nlabels.append("Nan")
        self._nlabel = nlabels

    def calculate_map_dist(self):  # compute the distance matrix
        """
        Calculates the grid distance, which will be used during the training
        steps. It supports only planar grids for the moment
        """
        nnodes = self.codebook.nnodes

        distance_matrix = np.zeros((nnodes, nnodes))
        for i in range(nnodes):
            #distance_matrix[i] = self.codebook.grid_dist(i).reshape(1, nnodes)
            distance_matrix[i] = self.codebook.grid_dist(i).T.reshape(
                1, nnodes)  # note: this is the squared distance
        return distance_matrix

    @timeit()
    def train(self,
              n_job=1,
              shared_memory=False,
              verbose='info',
              train_rough_len=None,
              train_rough_radiusin=None,
              train_rough_radiusfin=None,
              train_finetune_len=None,
              train_finetune_radiusin=None,
              train_finetune_radiusfin=None,
              train_len_factor=1,
              maxtrainlen=np.Inf,
              alreadyinit=False,
              watch_evolution=True):
        """
        Trains the som

        :param n_job: number of jobs used to parallelize the training
        :param shared_memory: flag to activate shared memory
        :param verbose: verbosity, could be 'debug', 'info' or None
        :param train_len_factor: factor that multiplies the default training lengths (similar to the "training" parameter in the Matlab version). (lbugnon)
        """
        logging.root.setLevel(
            getattr(logging, verbose.upper()) if verbose else logging.ERROR)

        logging.info(" Training...")
        print('Training ...')
        logging.debug((
            "--------------------------------------------------------------\n"
            " details: \n"
            "      > data len is {data_len} and data dimension is {data_dim}\n"
            "      > map size is {mpsz0},{mpsz1}\n"
            "      > array size in log10 scale is {array_size}\n"
            "      > number of jobs in parallel: {n_job}\n"
            " -------------------------------------------------------------\n"
        ).format(data_len=self._dlen,
                 data_dim=self._dim,
                 mpsz0=self.codebook.mapsize[0],
                 mpsz1=self.codebook.mapsize[1],
                 array_size=np.log10(self._dlen * self.codebook.nnodes *
                                     self._dim),
                 n_job=n_job))

        if self.initialization == 'random' and alreadyinit == False:
            self.codebook.random_initialization(self._data)

        elif self.initialization == 'custom' and alreadyinit == False:
            self.codebook.custom_initialization(self._data)

        elif self.initialization == 'pca' and alreadyinit == False:
            self.codebook.pca_linear_initialization(self._data, self.mask)
        elif alreadyinit == False:
            raise AttributeError('unknown initialization')
        if train_rough_len is None or train_rough_len > 0:  # None means 'use the default length'
            self.rough_train(njob=n_job,
                             shared_memory=shared_memory,
                             trainlen=train_rough_len,
                             radiusin=train_rough_radiusin,
                             radiusfin=train_rough_radiusfin,
                             trainlen_factor=train_len_factor,
                             maxtrainlen=maxtrainlen,
                             watch_evolution=watch_evolution)

        if train_finetune_len is None or train_finetune_len > 0:  # None means 'use the default length'
            self.finetune_train(njob=n_job,
                                shared_memory=shared_memory,
                                trainlen=train_finetune_len,
                                radiusin=train_finetune_radiusin,
                                radiusfin=train_finetune_radiusfin,
                                trainlen_factor=train_len_factor,
                                maxtrainlen=maxtrainlen,
                                watch_evolution=watch_evolution)

        if self._bmu is None:  # compute the BMUs even if no training was run
            self._bmu = self.find_bmu(self._data, njb=n_job)
        #self.plotplanes2()
        logging.debug(
            " --------------------------------------------------------------")
        logging.info(" Final quantization error: %f" % np.mean(self._bmu[1]))

    def _calculate_ms_and_mpd(self):
        mn = np.min(self.codebook.mapsize)
        max_s = max(self.codebook.mapsize[0], self.codebook.mapsize[1])

        if mn == 1:
            mpd = float(self.codebook.nnodes * 10) / float(self._dlen)
        else:
            mpd = float(self.codebook.nnodes) / float(self._dlen)
        ms = max_s / 2.0 if mn == 1 else max_s

        return ms, mpd

    def rough_train(self,
                    njob=1,
                    shared_memory=False,
                    trainlen=None,
                    radiusin=None,
                    radiusfin=None,
                    trainlen_factor=1,
                    maxtrainlen=np.Inf,
                    watch_evolution=True):
        logging.info(" Rough training...")
        print(" Rough training...")

        ms, mpd = self._calculate_ms_and_mpd()
        #lbugnon: add maxtrainlen
        trainlen = min(int(np.ceil(30 * mpd)),
                       maxtrainlen) if not trainlen else trainlen
        #print("maxtrainlen %d",maxtrainlen)
        #lbugnon: add trainlen_factor
        trainlen = int(trainlen * trainlen_factor)

        if self.initialization == 'random':
            radiusin = max(1, np.ceil(ms / 3.)) if not radiusin else radiusin
            radiusfin = max(1, radiusin / 6.) if not radiusfin else radiusfin

        elif self.initialization == 'pca':
            radiusin = max(1, np.ceil(ms / 8.)) if not radiusin else radiusin
            radiusfin = max(1, radiusin / 4.) if not radiusfin else radiusfin

        self._batchtrain(trainlen,
                         radiusin,
                         radiusfin,
                         njob,
                         shared_memory,
                         watch_evolution=watch_evolution)

    def finetune_train(self,
                       njob=1,
                       shared_memory=False,
                       trainlen=None,
                       radiusin=None,
                       radiusfin=None,
                       trainlen_factor=1,
                       maxtrainlen=np.Inf,
                       watch_evolution=True):
        logging.info(" Finetune training...")
        print('Finetune training')
        ms, mpd = self._calculate_ms_and_mpd()

        #lbugnon: add maxtrainlen
        if self.initialization == 'random':
            trainlen = min(int(np.ceil(50 * mpd)),
                           maxtrainlen) if not trainlen else trainlen
            radiusin = max(
                1, ms / 12.
            ) if not radiusin else radiusin  # from radius fin in rough training
            radiusfin = max(1, radiusin / 25.) if not radiusfin else radiusfin

        elif self.initialization == 'pca':
            trainlen = min(int(np.ceil(40 * mpd)),
                           maxtrainlen) if not trainlen else trainlen
            radiusin = max(1,
                           np.ceil(ms / 8.) / 4) if not radiusin else radiusin
            radiusfin = 1 if not radiusfin else radiusfin  # max(1, ms/128)

        #print("maxtrainlen %d",maxtrainlen)

        #lbugnon: add trainlen_factor
        trainlen = int(trainlen_factor * trainlen)

        self._batchtrain(trainlen,
                         radiusin,
                         radiusfin,
                         njob,
                         shared_memory,
                         watch_evolution=watch_evolution)

    def _batchtrain(self,
                    trainlen,
                    radiusin,
                    radiusfin,
                    njob=1,
                    shared_memory=False,
                    watch_evolution=True):

        if self.radius_train == 'linear':
            radius = np.linspace(radiusin, radiusfin, trainlen)

        elif self.radius_train == 'power_series':
            radius = []
            ratio = radiusfin / radiusin
            for i in range(trainlen):
                radius.append(radiusin * ((ratio)**(i / trainlen)))

        elif self.radius_train == 'inverse_of_time':
            radius = []
            B = trainlen / ((radiusin / radiusfin) - 1)
            A = B * radiusin
            for i in range(trainlen):
                radius.append(A / (i + B))

        else:
            raise AttributeError('unknown radius schedule')

        if shared_memory:
            data = self._data
            data_folder = tempfile.mkdtemp()
            data_name = os.path.join(data_folder, 'data')
            dump(data, data_name)
            data = load(data_name, mmap_mode='r')

        else:
            data = self._data

        bmu = None

        # X2 is part of euclidean distance (x-y)^2 = x^2 +y^2 - 2xy that we use
        # for each data row in bmu finding.
        # Since it is a fixed value we can skip it during bmu finding for each
        # data point, but later we need it to calculate the quantization error

        logging.info(" radius_ini: %f , radius_final: %f, trainlen: %d\n" %
                     (radiusin, radiusfin, trainlen))
        print(
            "radius_ini: {:.3f} , radius_final: {:.3f}, trainlen: {}\n".format(
                radiusin, radiusfin, trainlen))
        for i in range(trainlen):
            t1 = time()
            neighborhood = self.neighborhood.calculate(self._distance_matrix,
                                                       radius[i],
                                                       self.codebook.nnodes)
            bmu = self.find_bmu(data, njb=njob)
            self.codebook.matrix = self.update_codebook_voronoi(
                data, bmu, neighborhood)

            #print('number of activated neurons: ', len(np.unique(bmu[0])))
            qerror = self.calculate_quantization_error()
            terror = self.calculate_topographic_error()

            print('Epoch : {} qErr : {:.4f}  tErr : {:.4f}'.format(
                i, qerror, terror))

            logging.info(
                " epoch: {} ---> elapsed time:  {:2.2f}, quantization error: {:2.4f}"
                .format(i,
                        time() - t1, qerror))
            if np.any(np.isnan(qerror)):
                logging.info("nan quantization error, exit train\n")

                #sys.exit("quantization error=nan, exit train")
            if watch_evolution:  # the original 'i % 1 == 0' test was always true
                self.plotplanes()  # added (LB)
        if not watch_evolution:
            self.plotplanes()
        #print('bmu = {}'.format(bmu[1] + fixed_euclidean_x2))
        bmu = self.find_bmu(data,
                            njb=njob)  # added (LB): the BMUs must be recomputed
        #tmp= bmu[1] + fixed_euclidean_x2
        #tmp[tmp<0]=0        # ajout LB
        #bmu[1] = np.sqrt(bmu[1] + fixed_euclidean_x2)
        #bmu[1]=tmp
        self._bmu = bmu
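An illustrative comparison of the three radius schedules handled at the top of _batchtrain (linear, power series, inverse of time), using radiusin=5, radiusfin=1 and trainlen=5; the variable names here are only for this sketch.

import numpy as np

radiusin, radiusfin, trainlen = 5.0, 1.0, 5
linear = np.linspace(radiusin, radiusfin, trainlen)
power = [radiusin * (radiusfin / radiusin) ** (i / float(trainlen)) for i in range(trainlen)]
B = trainlen / ((radiusin / radiusfin) - 1)
inverse = [B * radiusin / (i + B) for i in range(trainlen)]
print(linear)                                   # [5. 4. 3. 2. 1.]
print(np.round(power, 3))                       # 5.0 down to about 1.38
print(np.round(inverse, 3))                     # 5.0 down to about 1.19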

    @timeit(logging.DEBUG)
    def find_bmu(self, input_matrix, njb=1, nth=1):
        """
        Finds the best matching unit (bmu) for each input data from the input
        matrix. It does all at once parallelizing the calculation instead of
        going through each input and running it against the codebook.

        :param input_matrix: numpy matrix representing inputs as rows and
            features/dimension as cols
        :param njb: number of jobs to parallelize the search
        :returns: the best matching unit for each input
        """
        dlen = input_matrix.shape[0]
        if self.mask is not None:  # added (LB)
            codebookmask = self.codebook.matrix * self.mask.squeeze()
            datamask = input_matrix * self.mask
        else:
            codebookmask = self.codebook.matrix
            datamask = input_matrix

        y2 = np.einsum('ij,ij->i', self.codebook.matrix,
                       codebookmask)  # sum of squares Y**2
        x2 = np.einsum('ij,ij->i', datamask, input_matrix)

        if njb == -1:
            njb = cpu_count()

        pool = Pool(njb)
        chunk_bmu_finder = _chunk_based_bmu_find

        def row_chunk(part):
            return part * dlen // njb

        def col_chunk(part):
            return min((part + 1) * dlen // njb, dlen)

        chunks = [input_matrix[row_chunk(i):col_chunk(i)] for i in range(njb)]

        #chunk_bmu_finder(input_matrix, self.codebook.matrix, y2, x2,nth=nth,mask=self.mask)

        b = pool.map(lambda chk: chunk_bmu_finder(
            chk, self.codebook.matrix, y2, x2, nth=nth, mask=self.mask),
                     chunks)  # modified (LB)

        pool.close()
        pool.join()
        bmu = np.asarray(list(itertools.chain(*b))).T
        del b
        return bmu

    @timeit(logging.DEBUG)
    def update_codebook_voronoi(self, training_data, bmu, neighborhood):
        """
        Updates the weights of each node in the codebook that belongs to the
        bmu's neighborhood.

        First finds the Voronoi set of each node, which only requires
        computing a smaller matrix.
        Much faster than the classic batch training algorithm; it is based
        on the corresponding algorithm in the SOM Toolbox for Matlab by
        Helsinki University.

        :param training_data: input matrix with input vectors as rows and
            vector features as cols
        :param bmu: best matching unit for each input data. Has shape of
            (2, dlen) where first row has bmu indexes
        :param neighborhood: matrix representing the neighborhood of each bmu

        :returns: An updated codebook that incorporates the learnings from the
            input data
        """
        row = bmu[0].astype(
            int)  # for each sample, the index of its associated codebook node
        col = np.arange(self._dlen)
        val = np.tile(1, self._dlen)  # one 1 per sample
        P = csr_matrix(
            (val, (row, col)), shape=(self.codebook.nnodes, self._dlen)
        )  # nnodes x nsamples, with a 1 where a sample falls in the node's Voronoi set, 0 otherwise
        S = P.dot(
            training_data
        )  # nnodes x dim: formula 5, page 11 of the SOM Toolbox 5 (Matlab) documentation

        # neighborhood has nnodes*nnodes and S has nnodes*dim
        # ---> numerator has nnodes*dim
        nom = neighborhood.T.dot(S)
        nV = P.sum(axis=1).reshape(1, self.codebook.nnodes)
        denom = nV.dot(neighborhood.T).reshape(self.codebook.nnodes,
                                               1)  # role of the transpose ???
        new_codebook = np.divide(nom, denom)
        if (denom == 0.0).sum() > 0:
            print('zero denominator', denom)
            raise ValueError('zero denominator in the codebook update')
        #return np.around(new_codebook, decimals=6)
        return np.asarray(new_codebook)  # modified (LB)

    def project_data(self, data, normalize=False):  # modified (LB)
        """
        Projects a data set onto a trained SOM. It is based on the
        nearest-neighbor search module of scikit-learn, but it is not that fast.
        """
        clf = neighbors.KNeighborsClassifier(n_neighbors=1)
        labels = np.arange(0, self.codebook.matrix.shape[0])
        clf.fit(self.codebook.matrix, labels)

        # The codebook values are all normalized
        # we can normalize the input data based on mean and std of
        # original data
        if normalize:
            data = self._normalizer.normalize_by(self.data_raw, data)

        return clf.predict(data)

    def predict_by(self, data, target, k=5, wt='distance'):
        # here it is assumed that target is the last column in the codebook
        # and data has dim-1 columns
        print('is the predict_by function ever used?')
        raise NotImplementedError('predict_by is not expected to be called')
        dim = self.codebook.matrix.shape[1]
        ind = np.arange(0, dim)
        indX = ind[ind != target]
        x = self.codebook.matrix[:, indX]
        y = self.codebook.matrix[:, target]
        n_neighbors = k
        clf = neighbors.KNeighborsRegressor(n_neighbors, weights=wt)
        clf.fit(x, y)

        # The codebook values are all normalized
        # we can normalize the input data based on mean and std of
        # original data
        dimdata = data.shape[1]

        if dimdata == dim:
            data[:, target] = 0
            data = self._normalizer.normalize_by(self.data_raw, data)
            data = data[:, indX]

        elif dimdata == dim - 1:
            data = self._normalizer.normalize_by(self.data_raw[:, indX], data)

        predicted_values = clf.predict(data)
        predicted_values = self._normalizer.denormalize_by(
            self.data_raw[:, target], predicted_values)
        return predicted_values

    def predict(self, x_test, k=5, wt='distance'):
        """
        As in scikit-learn, we assume we have X_train, y_train and X_test. It
        is assumed that the target is the last column in the codebook and that
        the data has dim-1 columns.

        :param x_test: input vector
        :param k: number of neighbors to use
        :param wt: method to use for the weights
            (more detail in KNeighborsRegressor docs)
        :returns: predicted values for the input data
        """

        target = self.data_raw.shape[1] - 1
        x_train = self.codebook.matrix[:, :target]
        y_train = self.codebook.matrix[:, target]
        clf = neighbors.KNeighborsRegressor(k, weights=wt)
        clf.fit(x_train, y_train)

        # The codebook values are all normalized
        # we can normalize the input data based on mean and std of
        # original data
        x_test = self._normalizer.normalize_by(self.data_raw[:, :target],
                                               x_test)
        predicted_values = clf.predict(x_test)

        return self._normalizer.denormalize_by(self.data_raw[:, target],
                                               predicted_values)

    def find_k_nodes(self, data, k=5):
        from sklearn.neighbors import NearestNeighbors
        # we find the k most similar nodes to the input vector
        neighbor = NearestNeighbors(n_neighbors=k)
        neighbor.fit(self.codebook.matrix)

        # The codebook values are all normalized
        # we can normalize the input data based on mean and std of
        # original data
        return neighbor.kneighbors(
            self._normalizer.normalize_by(self.data_raw, data))

    def bmu_ind_to_xy(self, bmu_ind):
        """
        Translates a best matching unit index to the corresponding
        matrix x,y coordinates.

        :param bmu_ind: node index of the best matching unit
            (number of node from top left node)
        :returns: corresponding (x,y) coordinate
        """
        rows = self.codebook.mapsize[0]
        cols = self.codebook.mapsize[1]

        # bmu_ind should be an integer between 0 and the number of nodes
        out = np.zeros((bmu_ind.shape[0], 3))
        out[:, 2] = bmu_ind
        out[:, 0] = bmu_ind // cols
        out[:, 1] = bmu_ind % cols

        return out.astype(int)
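
    # Worked example (illustrative): on a map with 15 columns, node index 47
    # maps to row 47 // 15 = 3 and column 47 % 15 = 2, so
    # bmu_ind_to_xy(np.array([47])) returns [[3, 2, 47]].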

    def cluster(self, n_clusters=8):
        import sklearn.cluster as clust
        # Alternative: cluster the denormalized codebook instead:
        #   cl_labels = clust.KMeans(n_clusters=n_clusters).fit_predict(
        #       self._normalizer.denormalize_by(self.data_raw,
        #                                       self.codebook.matrix))
        cl_labels = clust.KMeans(n_clusters=n_clusters).fit_predict(
            self.codebook.matrix)

        self.cluster_labels = cl_labels
        return cl_labels
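
    # Example usage (a sketch): cluster the map prototypes, then label each
    # training sample with the cluster of its best matching unit:
    #
    #     node_labels = som.cluster(n_clusters=8)   # one label per map node
    #     bmus = som.project_data(som.data_raw)     # best matching unit per sample
    #     sample_labels = node_labels[bmus.astype(int)]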

    def predict_probability(self, data, target, k=5):
        """
        Predicts the probability of the input data being the target

        :param data: data to predict; it is assumed that 'target' is the last
            column in the codebook, so data should have dim-1 columns
        :param target: index of the target column
        :param k: k parameter of KNeighborsRegressor
        :returns: probability of the data being the target
        """
        dim = self.codebook.matrix.shape[1]
        ind = np.arange(0, dim)
        indx = ind[ind != target]
        x = self.codebook.matrix[:, indx]
        y = self.codebook.matrix[:, target]

        clf = neighbors.KNeighborsRegressor(k, weights='distance')
        clf.fit(x, y)

        # The codebook values are all normalized
        # we can normalize the input data based on mean and std of
        # original data
        dimdata = data.shape[1]

        if dimdata == dim:
            data[:, target] = 0
            data = self._normalizer.normalize_by(self.data_raw, data)
            data = data[:, indx]

        elif dimdata == dim - 1:
            data = self._normalizer.normalize_by(self.data_raw[:, indx], data)

        weights, ind = clf.kneighbors(data,
                                      n_neighbors=k,
                                      return_distance=True)
        weights = 1. / weights
        sum_ = np.sum(weights, axis=1)
        weights = weights / sum_[:, np.newaxis]
        labels = np.sign(self.codebook.matrix[ind, target])
        labels[labels >= 0] = 1

        # for positives
        pos_prob = labels.copy()
        pos_prob[pos_prob < 0] = 0
        pos_prob *= weights
        pos_prob = np.sum(pos_prob, axis=1)[:, np.newaxis]

        # for negatives
        neg_prob = labels.copy()
        neg_prob[neg_prob > 0] = 0
        neg_prob = neg_prob * weights * -1
        neg_prob = np.sum(neg_prob, axis=1)[:, np.newaxis]

        return np.concatenate((pos_prob, neg_prob), axis=1)
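
    # Interpretation sketch: each row of the returned array holds the
    # inverse-distance-weighted probability of the positive class (column 0)
    # and of the negative class (column 1) for the corresponding sample, e.g.
    #
    #     probs = som.predict_probability(x_new, target=dim - 1, k=5)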

    def node_activation(self, data, target=None, wt='distance'):
        weights, ind = None, None

        # Only the target-free case (activation of every node) is implemented.
        if target is None:
            clf = neighbors.KNeighborsClassifier(
                n_neighbors=self.codebook.nnodes)
            labels = np.arange(0, self.codebook.matrix.shape[0])
            clf.fit(self.codebook.matrix, labels)

            # The codebook values are all normalized
            # we can normalize the input data based on mean and std of
            # original data
            data = self._normalizer.normalize_by(self.data_raw, data)
            weights, ind = clf.kneighbors(data)

            # Inverse-distance weighting: closer nodes get larger activations
            weights = 1. / weights

        return weights, ind

    def calculate_topographic_error(self):
        bmus1 = self.find_bmu(self.data_raw, njb=1, nth=1)
        bmus2 = self.find_bmu(self.data_raw, njb=1, nth=2)
        topographic_error = None
        if self.codebook.lattice == "rect":
            bmus_gap = np.abs(
                (self.bmu_ind_to_xy(np.array(bmus1[0]))[:, 0:2] -
                 self.bmu_ind_to_xy(np.array(bmus2[0]))[:, 0:2]).sum(axis=1))
            topographic_error = np.mean(bmus_gap != 1)
        elif self.codebook.lattice == "hexa":
            dist_matrix_1 = self.codebook.lattice_distances[bmus1[0].astype(
                int)].reshape(len(bmus1[0]), -1)
            topographic_error = (np.array([
                distances[bmu2]
                for bmu2, distances in zip(bmus2[0].astype(int), dist_matrix_1)
            ]) > 2).mean()
        return topographic_error

    def calculate_quantization_error(self):
        neuron_values = self.codebook.matrix[self.find_bmu(
            self._data)[0].astype(int)]
        quantization_error = np.mean(
            np.abs(neuron_values - self._data))  # L1 norm rather than L2?
        return quantization_error
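
    # Example usage (a sketch): both quality measures are usually inspected
    # after training, and lower is better for each:
    #
    #     te = som.calculate_topographic_error()   # share of samples whose two
    #                                              # best units are not adjacent
    #     qe = som.calculate_quantization_error()  # mean absolute gap to the BMU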

    def calculate_map_size(self, lattice):
        """
        Calculates the optimal map size given a dataset using eigenvalues and eigenvectors. Matlab ported
        :lattice: 'rect' or 'hex'
        :return: map sizes
        """
        D = self.data_raw.copy()
        dlen = D.shape[0]
        dim = D.shape[1]
        munits = np.ceil(5 * (dlen**0.5))
        A = np.full((dim, dim), np.inf)

        for i in range(dim):
            D[:, i] = D[:, i] - np.mean(D[np.isfinite(D[:, i]), i])

        for i in range(dim):
            for j in range(dim):
                c = D[:, i] * D[:, j]
                c = c[np.isfinite(c)]
                A[i, j] = sum(c) / len(c)
                A[j, i] = A[i, j]

        VS = np.linalg.eig(A)
        eigval = sorted(VS[0])
        if eigval[-1] == 0 or eigval[-2] * munits < eigval[-1]:
            ratio = 1
        else:
            ratio = np.sqrt(eigval[-1] / eigval[-2])

        if lattice == "rect":
            size1 = min(munits, round(np.sqrt(munits / ratio)))
        else:
            size1 = min(munits, round(np.sqrt(munits / ratio * np.sqrt(0.75))))

        size2 = round(munits / size1)

        return [int(size1), int(size2)]
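
    # Worked example (illustrative): for 1500 samples the heuristic allocates
    # munits = ceil(5 * sqrt(1500)) = 194 nodes; if the square root of the
    # ratio of the two largest eigenvalues is about 2, this gives
    # size1 = round(sqrt(194 / 2)) = 10 and size2 = round(194 / 10) = 19,
    # i.e. roughly a 10 x 19 rectangular map.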

    def calculate_neighbours_nodes(self):
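        # For each map node, collect the indices of the nodes whose entry in
        # the precomputed distance matrix is non-zero and below 1.5, i.e. its
        # immediate neighbours on the map.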
        res = []
        for i in range(self.codebook.nnodes):
            current = []
            for j in range(self.codebook.nnodes):
                if (self._distance_matrix[i][j] < 1.5
                        and self._distance_matrix[i][j] != 0):
                    current.append(j)
            res.append(current)
        return res
예제 #23
0
from codebook import Codebook
import cv2 as cv
import numpy as np

FILE_NAME = "./examples/img.png"

img = cv.imread(FILE_NAME, cv.IMREAD_GRAYSCALE)  # read as single-channel 8-bit
count = Codebook.count_people(img)
print(count)
예제 #24
0
    def __init__(self,
                 data,
                 neighborhood,
                 normalizer=None,
                 mapsize=None,
                 mask=None,
                 mapshape='planar',
                 lattice='rect',
                 initialization='pca',
                 training='batch',
                 radius_train='linear',
                 name='sompy',
                 component_names=None,
                 components_to_plot=None,
                 isNormalized=False):
        """
        Self Organizing Map

        :param data: data to be clustered, represented as a matrix of n rows,
            as inputs and m cols as input features
        :param neighborhood: neighborhood object calculator.
        :param normalizer: normalizer object calculator.
        :param mapsize: tuple/list defining the dimensions of the som. If a
            single number is provided, it is taken as the number of nodes.
        :param mask: mask
        :param mapshape: shape of the som.
        :param lattice: type of lattice.
        :param initialization: method to be used for initialization of the som.
        :param name: name used to identify the som
        :param training: training mode (seq, batch)
        :param radius_train: decay scheme of the neighborhood radius during
            training
        :param component_names: optional names of the input features
        :param components_to_plot: components to plot
        :param isNormalized: set to True if the data is already normalized
        """

        if data is not None:  # added by LB
            if normalizer and not isNormalized:
                for i in range(len(normalizer)):
                    data[:, i] = normalizer[i].normalize(data[:, i])
                self._data = data
            else:
                self._data = data
            self._data = self._data.astype('double')  # added by LB
            self._dim = data.shape[1]
            self._dlen = data.shape[0]
        else:
            self._data = None
            self._dim = None
            self._dlen = None

        self._normalizer = normalizer
        self._dlabel = None
        self._bmu = None

        self.name = name
        self.data_raw = data
        self.neighborhood = neighborhood
        self.mapshape = mapshape
        self.initialization = initialization
        self.mask = mask
        mapsize = self.calculate_map_size(lattice) if not mapsize else mapsize
        self.mapsize = mapsize
        self.codebook = Codebook(mapsize, lattice)
        self.training = training
        self.radius_train = radius_train
        self._component_names = (self.build_component_names()
                                 if component_names is None
                                 else [component_names])
        self._distance_matrix = self.calculate_map_dist()
        self.components_to_plot = components_to_plot
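
    # Construction sketch (hypothetical; `GaussianNeighborhood` and
    # `VarianceNormalizer` are placeholder names for the neighborhood and
    # normalizer calculators defined elsewhere in this library, and the
    # enclosing class is referred to here as SOM):
    #
    #     som = SOM(data,
    #               neighborhood=GaussianNeighborhood(),
    #               normalizer=[VarianceNormalizer() for _ in range(data.shape[1])],
    #               mapsize=[10, 15], lattice='rect', initialization='pca')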