Example #1
0
    def __init__(self, side, dimension, lc=None, uc=None):
        """Set up an untrained map.

        side      -- side length of the (square) grid
        dimension -- length of each grid vector
        lc, uc    -- optional lower / upper bounds; when lc is given the
                     initial random weights are rescaled from 0 -> 1 to
                     lc -> uc (uc must then be supplied as well)
        """
        self.side = side # side length of the grid
        self.dimension = dimension # size of our grid vectors
        self.bestMatchCoords = [] # x/y coords of classified vectors

        # the radius of neighbour nodes which will be influenced by each
        # new training vector
        self.radius = float(side)/2

        # initialise the nodes to random values between 0 -> 1
        self.weights = TM(self.side, dimension=self.dimension, randomize=True)

        # cutoff for turning the VS_flat into a boundary mask
        # this is a magic number, but it seems to work OK
        self.maskCutoff = 16
        self.boundaryMask = np.zeros((self.side, self.side))
        self.VS_flat = np.zeros((self.side, self.side))

        # bin assignments, the 0 bin is 'not assigned'
        self.binAssignments = np.zeros((self.side, self.side))

        if lc is not None:
            # scale by the width of the interval (not by uc) and then shift,
            # so that values in 0 -> 1 end up in lc -> uc
            diff = uc - lc
            self.weights.nodes *= diff
            self.weights.nodes += lc
        self.regions = None

        # we'd like to know who is next to whom
        self.regionNeighbours = {}
Example #2
0
class SOM:
    """A single instance of a self organising map.

    The map is a square grid of `side` x `side` nodes, each holding a vector
    of length `dimension`. Neighbourhoods wrap around at the edges (the
    training worksheet and the flood fill both treat the grid as a torus).

    Collaborators that are defined elsewhere in the project and only used
    here: TM (grid storage, best-match search, rendering), Rainbow (colour
    ramp), Image (image output) and ge (project exceptions).
    """
    def __init__(self, side, dimension, lc=None, uc=None):
        """Set up an untrained map.

        side      -- side length of the (square) grid
        dimension -- length of each grid vector
        lc, uc    -- optional lower / upper bounds; when lc is given the
                     initial random weights are rescaled from 0 -> 1 to
                     lc -> uc (uc must then be supplied as well)
        """
        self.side = side # side length of the grid
        self.dimension = dimension # size of our grid vectors
        self.bestMatchCoords = [] # x/y coords of classified vectors

        # the radius of neighbour nodes which will be influenced by each
        # new training vector
        self.radius = float(side)/2

        # initialise the nodes to random values between 0 -> 1
        self.weights = TM(self.side, dimension=self.dimension, randomize=True)

        # cutoff for turning the VS_flat into a boundary mask
        # this is a magic number, but it seems to work OK
        self.maskCutoff = 16
        self.boundaryMask = np.zeros((self.side, self.side))
        self.VS_flat = np.zeros((self.side, self.side))

        # bin assignments, the 0 bin is 'not assigned'
        self.binAssignments = np.zeros((self.side, self.side))

        if lc is not None:
            # scale by the width of the interval (not by uc) and then shift,
            # so that values in 0 -> 1 end up in lc -> uc
            diff = uc - lc
            self.weights.nodes *= diff
            self.weights.nodes += lc
        self.regions = None

        # we'd like to know who is next to whom
        self.regionNeighbours = {}

    def getWeights(self):
        """Get the weights nodes"""
        return self.weights.nodes

    def getRegions(self):
        """Get the regions nodes (regions must have been built or loaded)"""
        return self.regions.nodes

    def loadWeights(self, nodes):
        """Use externally supplied data as the weights nodes"""
        self.weights.nodes = nodes

    def loadRegions(self, nodes):
        """Use externally supplied regions"""
        self.regions = TM(self.side, dimension=1)
        self.regions.nodes = nodes

#------------------------------------------------------------------------------
# CLASSIFICATION

    # Classify!
    # run this after training to get an X/Y for each vector
    # you'd like to classify
    def classify(self, point):
        """Classify an individual point

        Returns the first element of the regions node at the best matching
        grid position for point.
        """
        (row, col) = self.weights.bestMatch(point)
        return self.regions.nodes[row, col][0]

    def regionalise(self, bids, trainVector):
        """Create regions on the torus based on matches with the training vector
        Train vector is a list of numpy arrays

        Every grid node is labelled with the bid of the training vector
        nearest to that node's weights.
        """
        # build a regions structure
        self.regions = TM(self.side, dimension=1)
        for row in range(self.side):
            for col in range(self.side):
                self.regions.nodes[row, col] = self.classifyPoint(self.weights.nodes[row, col],
                                                                  trainVector,
                                                                  bids)

    def classifyPoint(self, point, trainVector, bids):
        """Returns the bid of the best match to the trainVector
        trainVector and bids must be in sync

        The best match is the row of trainVector with the smallest euclidean
        distance to point; the first such row wins a tie.
        """
        distances = (((trainVector - point)**2).sum(axis=1))**0.5
        return bids[np.argmin(distances)]

    def findRegionNeighbours(self):
        """Find out which regions neighbour which other regions

        Rebuilds self.regionNeighbours (bid -> list of adjacent bids) and
        returns a list of the distinct (low bid, high bid) pairs found.
        """
        # NOTE(review): the scan stops one short of the last row / column,
        # so pairs which only touch across the wrap-around edge are not
        # reported -- confirm whether that is intended
        neighbours = {}
        self.regionNeighbours = {}
        for row in range(self.side-1):
            for col in range(self.side-1):
                s_bid = self.regions.nodes[row, col][0]
                # test against right, down and diagonally down
                q_bids = (self.regions.nodes[row, col+1][0],
                          self.regions.nodes[row+1, col][0],
                          self.regions.nodes[row+1, col+1][0])
                for q_bid in q_bids:
                    if s_bid == q_bid:
                        continue
                    # make storage for both regions as needed
                    s_list = self.regionNeighbours.setdefault(s_bid, [])
                    q_list = self.regionNeighbours.setdefault(q_bid, [])
                    # add in the neighbours
                    if q_bid not in s_list:
                        s_list.append(q_bid)
                    if s_bid not in q_list:
                        q_list.append(s_bid)

                    # we only need to return a tuple
                    neighbours[self.makeNTuple(s_bid, q_bid)] = True
        return list(neighbours)

    def makeNTuple(self, bid1, bid2):
        """A way for making standard tuples from bids (smallest bid first)"""
        if bid1 < bid2:
            return (bid1, bid2)
        return (bid2, bid1)

    def getNeighbours(self, bids):
        """return the neighbours of these bids

        The result is a list without repeats, in order of first appearance.
        bids which have no entry in self.regionNeighbours are skipped.
        """
        ret_list = []
        for bid in bids:
            if bid in self.regionNeighbours:
                ret_list.extend([i for i in self.regionNeighbours[bid] if i not in ret_list])
        return ret_list

#------------------------------------------------------------------------------
# TRAINING

    def train(self,
              trainVector,
              weights=None,
              iterations=1000,
              vectorSubSet=1000,
              weightImgFileNamePrefix="",
              epsilom=0.0001,
              influenceRate=0.4,
              mask=None,
              radius=0.,
              silent=True
              ):
        """Train the SOM
        Train vector is a list of numpy arrays

        trainVector   -- the training vectors, each of length self.dimension
        weights       -- optional (rows, cols, dimension) array to train
                         instead of the map's own weights
        iterations    -- maximum number of training rounds
        vectorSubSet  -- maximum number of training vectors used per round;
                         a fresh random selection is made every round when
                         more are available
        weightImgFileNamePrefix -- when not "", the map's own weights surface
                         is rendered to "<prefix>_<round>.jpg" every round
        epsilom       -- not used; kept for interface compatibility
        influenceRate -- influence applied at the centre of the stamp
        mask          -- optional container of (row, col) positions; when
                         given only those nodes are updated (in place)
        radius        -- starting neighbourhood radius, 0 means self.radius
        silent        -- set to False to get progress output on stdout

        Returns the trained weights array. Training stops early as soon as
        the decaying radius drops below 2.
        """

        if not silent:
            print("    Start SOM training. Side: %d Max: %d iterations" % (self.side, iterations))

        if radius == 0.0:
            radius = self.radius

        # we can use a dummy set of weights, or the *true* weights
        replace_weights = weights is None
        if replace_weights:
            weights = self.weights.nodes
            flat_nodes = self.weights.flatNodes
            rows = self.side
            cols = self.side
        else:
            shape = np.shape(weights)
            rows = shape[0]
            cols = shape[1]
            flat_nodes = weights.reshape((rows*cols, self.dimension))

        # the decaying radius never exceeds the starting radius and the
        # loop below gives up as soon as it is under 2, so a start under 2
        # can not change anything. Leaving here also avoids dividing by
        # log(1) == 0 when setting up the time constant.
        if radius < 2:
            return weights

        # over time we'll shrink the radius of nodes which
        # are influenced by the current training node
        time_constant = iterations/log(radius)

        # we would ideally like to select guys from the training set at random
        num_vectors = len(trainVector)
        use_subset = num_vectors > vectorSubSet
        if use_subset:
            rand_index_array = np.arange(num_vectors)
        else:
            index_array = np.arange(num_vectors)

        for i in range(1, iterations+1):
            if not silent:
                sys.stdout.write("\r    Iteration: % 4d of % 4d" % (i, iterations))
                sys.stdout.flush()

#--------
# Make stamp
            # exponential decay on radius and amount of influence
            radius_decaying = radius*exp(-1.0*i/time_constant)
            if radius_decaying < 2:
                return weights

            stamp = self._buildStamp(radius_decaying, influenceRate)

            # keep track of how big the stamp is now
            stamp_radius = (len(stamp)-1) // 2
            # one stamp value scales every component of a node's vector
            stamp_3d = stamp[:, :, np.newaxis]

            # if there are more than vectorSubSet training vecs
            # take a random selection
            if use_subset:
                np.random.shuffle(rand_index_array)
                index_array = rand_index_array[:vectorSubSet]
#--------
# Make worksheet
            # a 3 x 3 tiling of the weights; stamping around a node in the
            # centre tile may spill over into the surrounding tiles, which
            # is how wrap-around is handled
            worksheet = np.asarray(np.tile(weights, (3, 3, 1)), dtype=float)

            # make a set of "delta nodes"
            # these contain the changes to the set of grid nodes
            # and we will add their values to the grid nodes
            # once we have input all the training nodes
            deltasheet = np.zeros_like(worksheet)

            for j in index_array:
                vector = trainVector[j]
                # find the best match between then training vector and the
                # current grid, inlined for greater speed
                loc = np.argmin(cdist(flat_nodes, [vector]))
                (row, col) = divmod(int(loc), cols)

                # row col represent the center of the stamp
                r_lo = rows + row - stamp_radius
                r_hi = rows + row + stamp_radius + 1
                c_lo = cols + col - stamp_radius
                c_hi = cols + col + stamp_radius + 1

                # pull every node under the stamp towards the training
                # vector, scaled by the stamp's influence at that offset
                deltasheet[r_lo:r_hi, c_lo:c_hi] += (vector - worksheet[r_lo:r_hi, c_lo:c_hi]) * stamp_3d

            # now fold the deltas and update the weights
            deltasheet[:, cols:2*cols] += deltasheet[:, 0:cols]
            deltasheet[:, cols:2*cols] += deltasheet[:, 2*cols:3*cols]
            deltasheet[rows:2*rows, cols:2*cols] += deltasheet[0:rows, cols:2*cols]
            deltasheet[rows:2*rows, cols:2*cols] += deltasheet[2*rows:3*rows, cols:2*cols]
            delta_fold = deltasheet[rows:2*rows, cols:2*cols]

            # add the deltas to the grid nodes and clip to keep between 0 and 1
            if mask is None:
                weights = np.clip(weights + delta_fold, 0, 1)
            else:
                for (r, c) in mask:
                    weights[r, c] = np.clip(weights[r, c] + delta_fold[r, c], 0, 1)

            # the best match search must see the updated nodes next round
            if replace_weights:
                flat_nodes = self.weights.fixFlatNodes(weights=weights)
            else:
                flat_nodes = weights.reshape((rows*cols, self.dimension))

            # make a tmp image, perhaps
            if weightImgFileNamePrefix != "":
                filename = "%s_%04d.jpg" % (weightImgFileNamePrefix, i)
                print(" writing: %s" % filename)
                self.weights.renderSurface(filename)

        return weights

    def _buildStamp(self, radius_decaying, influenceRate):
        """Build the square influence stamp for one training round

        Influence is influenceRate at the centre and falls off linearly with
        distance, reaching 0 at radius_decaying. Offsets whose rounded
        distance is larger than radius_decaying get no influence, and outer
        rows / columns that end up all zero are trimmed off.
        """
        grad = -1 * influenceRate / radius_decaying
        max_radius = int(radius_decaying)

        # one quadrant of the stamp, origin at [0, 0]
        (q_rows, q_cols) = np.indices((max_radius+1, max_radius+1))
        true_dist = np.sqrt(q_rows**2 + q_cols**2)
        # compare the rounded euclidean distance against the radius
        in_range = np.round(true_dist+0.00001) <= radius_decaying
        # influence is proportional to distance
        q_stamp = np.where(in_range, true_dist*grad + influenceRate, 0.)

        # every axis cell is shared by two quadrants, so divide by 2
        # and we don't mess up the stamp when the quadrants are summed
        q_stamp[:, 0] /= 2
        q_stamp[0, :] /= 2
        stamp = np.zeros((2*max_radius+1, 2*max_radius+1))
        # bottom right
        stamp[max_radius:, max_radius:] += q_stamp
        # top right
        stamp[:max_radius+1, max_radius:] += np.rot90(q_stamp, 1)
        # top left
        stamp[:max_radius+1, :max_radius+1] += np.rot90(q_stamp, 2)
        # bottom left
        stamp[max_radius:, :max_radius+1] += np.rot90(q_stamp, 3)
        # center
        stamp[max_radius, max_radius] = influenceRate

        # now find where the useless info is and cull it from the stamp
        max_vals = np.max(stamp, axis=0)
        k = 0
        while k < len(max_vals) and not (max_vals[k] > 0.0):
            k += 1
        if k < len(max_vals):
            stamp = stamp[k:2*max_radius+1-k, k:2*max_radius+1-k]
        return stamp

    def makeBoundaryMask(self, plotMaskFile=""):
        """Make a mask for cutting out boundaries

        Stores the flattened variance surface in self.VS_flat and sets
        self.boundaryMask to 1 wherever it exceeds self.maskCutoff (0
        elsewhere). The mask is rendered to plotMaskFile when one is given.
        """
        # First create the mask
        VS = self.weights.buildVarianceSurface()
        # NOTE(review): only the first four channels of the variance surface
        # are summed -- presumably the map dimension is always 4 here, confirm
        self.VS_flat = np.array(VS[:,:,0] + VS[:,:,1] + VS[:,:,2] + VS[:,:,3]).reshape((self.side, self.side)) * 250
        self.boundaryMask = np.where(self.VS_flat > self.maskCutoff, 1., 0.)

        if plotMaskFile != "":
            self.renderBoundaryMask(plotMaskFile)

    def maskBoundaries(self, addNoise=False, weights=None, mask=None, doFlat=False):
        """mask boundaries and add some random noise to
        some non-masked areas if asked to

        weights -- array to modify in place, defaults to the map's own nodes
        mask    -- boundary mask to apply, defaults to self.boundaryMask
        doFlat  -- call fixFlatNodes on the map's own weights afterwards
        """
        max_noise = 0.1
        noise_targets = 3
        if weights is None:
            weights = self.weights.nodes
            rows = self.side
            cols = self.side
        else:
            shape = np.shape(weights)
            rows = shape[0]
            cols = shape[1]

        if mask is None:
            mask = self.boundaryMask
        for r in range(rows):
            for c in range(cols):
                if mask[r, c] == 1:
                    # on the boundary, mask as -1's
                    weights[r, c] = [-1.]*self.dimension
                elif addNoise:
                    # the one argument form of randint is numpy's: it
                    # draws from 0 .. 9
                    if np.random.randint(10) <= noise_targets:
                        # add some noise, scale up by at most max_noise
                        noise_amount = np.random.random() * max_noise + 1.0
                        weights[r, c] *= noise_amount
        if doFlat:
            self.weights.fixFlatNodes()

    def defineBinRegions(self, bids, binProfiles, render=False):
        """Work out which bins go where

        bids and binProfiles must be in sync. Each bin claims the unmasked
        region around its best matching node; whatever is left unmasked and
        unclaimed afterwards is added to the boundary mask. With render set
        the mask is written to "S3.png" (before) and "S4.png" (after the
        clean up).
        """
        rcols = {}
        rand_col_lower = 15
        rand_col_upper = 200
        bp_map = {}

        # use a flood fill algorithm to color in adjacent spots
        # and assign bins to unmasked points
        for (i, bid) in enumerate(bids):
            profile = binProfiles[i]
            # find out where this bin matches bestest
            [row, col] = self.weights.bestMatch(profile)
            bp_map[bid] = profile
            rcols[bid] = (randrange(rand_col_lower, rand_col_upper),
                          randrange(rand_col_lower, rand_col_upper),
                          randrange(rand_col_lower, rand_col_upper)
                         )
            self.expandAssign(row, col, bid, bp_map)
        if render:
            self.renderBoundaryMask("S3.png", colMap=rcols)

        # now clean up the mask: unmasked AND unassigned becomes masked
        leftovers = (self.boundaryMask == 0) & (self.binAssignments == 0)
        self.boundaryMask[leftovers] = 1
        if render:
            self.renderBoundaryMask("S4.png", colMap=rcols)

    def expandAssign(self, startR, startC, bid, binProfileMap):
        """Based on floodfill, add more points to a bin assignment

        All unmasked points reachable from (startR, startC) are assigned to
        bid. When one of them already belongs to another bin, the mask
        cutoff is halved (for this region only) until the two bins fall
        into separate regions; a warning is printed when that never happens.
        binProfileMap maps bid -> profile and is used to find where the
        other bin's region started.
        """
        # get all the points within this region
        points = self.floodFill(startR, startC, self.boundaryMask)
        collision_bid = 0
        for (r, c) in points:
            current = self.binAssignments[r, c]
            if current != 0 and current != bid:
                # we have already assigned this point to a bin
                # most likely we need to set the mask cutoff higher, but just for this region
                collision_bid = current
                break
            self.binAssignments[r, c] = bid

        if collision_bid == 0:
            return

        resolved = False
        # where the old bin's floodfill started
        [crow, ccol] = self.weights.bestMatch(binProfileMap[collision_bid])
        mc = self.maskCutoff
        # we can't do anything if we can't lower the cutoff...
        while mc >= 2:
            # rebuild the mask with a new cutoff
            mc = mc/2
            mask = np.copy(self.boundaryMask)
            for (r, c) in points:
                if self.VS_flat[r, c] > mc:
                    mask[r, c] = 1.
                else:
                    mask[r, c] = 0.
            collision_points = self.floodFill(crow, ccol, mask)
            new_points = self.floodFill(startR, startC, mask)
            if len(collision_points) == 0 or len(new_points) == 0:
                continue
            # there should be no overlap, a flood fill either covers all of
            # the other one or none of it, so checking one point will do
            if next(iter(collision_points)) not in new_points:
                # we have resolved the issue
                resolved = True
                # now we need to fix the binAssignments and boundary mask
                self.boundaryMask = mask
                for (r, c) in points:
                    if (r, c) in new_points:
                        # assign this point to the new bid
                        self.binAssignments[r, c] = bid
                    elif (r, c) in collision_points:
                        # point stays with the old bid
                        self.binAssignments[r, c] = collision_bid
                    else:
                        # point is now in the mask
                        self.binAssignments[r, c] = 0.
                break

        if not resolved:
            print("Cannot repair map, bin %d may be incorrectly merged with bin %d" % (bid, collision_bid))

    def makeBinMask(self, profile, fileName="", dim=False):
        """Return a mask of the region this profile falls in

        The return value is the flood fill dictionary of the unmasked region
        around the profile's best matching node. When fileName is given the
        region is rendered to that file as well. dim is not used.
        """
        [r, c] = self.weights.bestMatch(profile)
        points = self.floodFill(r, c, self.boundaryMask)
        if fileName != "":
            ret_mask = np.ones_like(self.boundaryMask)
            for (pr, pc) in points:
                ret_mask[pr, pc] = 0
            self.renderBoundaryMask(fileName, mask=ret_mask)

        return points

    def floodFill(self, startR, startC, mask):
        """Return all points affected by a flood fill operation at the given point

        The result maps (row, col) -> [row, col] for every cell that can be
        reached from the start through 4-connected steps without crossing a
        cell where mask is 1. It is empty when the start itself is masked.
        """
        points = {}
        toFill = set([(startR, startC)])
        seen = set([(startR, startC)])
        side = self.side
        while toFill:
            (r, c) = toFill.pop()
            if mask[r, c] == 1:
                # we are at the boundary of a region
                continue

            points[(r, c)] = [r, c]

            # don't forget we're on a torus
            for step in (((r - 1) % side, c),
                         ((r + 1) % side, c),
                         (r, (c - 1) % side),
                         (r, (c + 1) % side)):
                if step not in seen:
                    seen.add(step)
                    toFill.add(step)

        return points

    def secondsToStr(self, t):
        """Format a number of seconds as H:MM:SS.mmm"""
        # reduce is not a builtin on every python version
        from functools import reduce
        rediv = lambda ll, b: list(divmod(ll[0], b)) + ll[1:]
        return "%d:%02d:%02d.%03d" % tuple(reduce(rediv, [[t*1000,], 1000, 60, 60]))

#------------------------------------------------------------------------------
# CLASSIFICATION

    def classifyContig(self, profile):
        """Classify this contig

        Returns the bin assignment (as an int, 0 is 'not assigned') at the
        best matching node for profile.
        """
        [r, c] = self.weights.bestMatch(profile)
        return int(self.binAssignments[r, c])

#------------------------------------------------------------------------------
# IO and IMAGE RENDERING

    def renderWeights(self, tag, weights=None):
        """Render the surface weights to <tag>.png

        The map's own weights are rendered unless weights is supplied.
        """
        filename = tag+".png"
        if weights is None:
            self.weights.renderSurface(filename)
        else:
            self.weights.renderSurface(filename, nodes=weights)

    def renderRegions(self, tag, palette):
        """Render the regions to <tag>.png
        palette is a hash of bid -> color

        Raises ge.RegionsDontExistException when no regions are set.
        """
        filename = tag+".png"
        if self.regions is None:
            raise ge.RegionsDontExistException
        try:
            # image sizes are (width, height)
            img = Image.new("RGB", (self.weights.columns, self.weights.rows))
            for row in range(self.side):
                for col in range(self.side):
                    img.putpixel((col, row), palette[self.regions.nodes[row, col][0]])
            img = img.resize((self.weights.columns*10, self.weights.rows*10), Image.NEAREST)
            img.save(filename)
        except Exception:
            print(sys.exc_info()[0])
            raise

    def renderBoundaryMask(self, fileName, mask=None, colMap=None):
        """Plot the boundary mask

        Masked cells are drawn black. Unmasked cells are drawn in the colour
        colMap holds for their bin assignment, or white when there is no
        colMap or no entry for that bin. mask defaults to self.boundaryMask.
        """
        if mask is None:
            mask = self.boundaryMask
        white = (255, 255, 255)
        black = (0, 0, 0)
        try:
            img = Image.new("RGB", (self.side, self.side))
            for r in range(self.side):
                for c in range(self.side):
                    if mask[r, c] != 0:
                        col = black
                    elif colMap is None:
                        col = white
                    else:
                        col = colMap.get(self.binAssignments[r, c], white)
                    img.putpixel((c, r), col)
            img = img.resize((self.side*10, self.side*10), Image.NEAREST)
            img.save(fileName)
        except Exception:
            print(sys.exc_info()[0])
            raise

    def transColour(self, val):
        """Transform color value (10 times the natural log of val)"""
        return 10 * log(val)

    def renderBestMatches(self, fileName, weighted=False):
        """make an image of where best matches lie
        set weighted to use a heatmap view of where they map
        """
        img_points = np.zeros((self.weights.rows, self.weights.columns))
        try:
            img = Image.new("RGB", (self.weights.columns, self.weights.rows))
            if weighted: # color points by bestmatch density
                max_count = 0
                for point in self.bestMatchCoords:
                    img_points[point[0], point[1]] += 1
                    if max_count < img_points[point[0], point[1]]:
                        max_count = img_points[point[0], point[1]]
                max_count += 1
                resolution = 200
                if max_count < resolution:
                    resolution = max_count - 1
                upper = self.transColour(max_count)
                rainbow = Rainbow(0, upper, resolution, "gbr")
                for point in self.bestMatchCoords:
                    img.putpixel((point[1], point[0]), rainbow.getColour(self.transColour(img_points[point[0], point[1]])))
            else: # make all best match points white
                for point in self.bestMatchCoords:
                    img.putpixel((point[1], point[0]), (255, 255, 255))
            img = img.resize((self.weights.columns*10, self.weights.rows*10), Image.NEAREST)
            img.save(fileName)
        except Exception:
            print(sys.exc_info()[0])
            raise