def main():
    p = opt.ArgumentParser(description="""
            Computes textural tissue descriptors from an RGB image (of an H&E slide).
            """)
    p.add_argument('img_file', action='store', help='RGB image file of an H&E slide')
    p.add_argument('out_file', action='store', default='descriptors.dat',
                   help='Name of the result file')

    # p.add_argument('model_file', action='store', help='Models file')
    p.add_argument('--scale', action='store', type=float, default=1.0,
                   help='Scale of the image at which the descriptors are computed (default: 1.0)')
    p.add_argument('--ngl', type=int, default=16, action='store',
                   help='Number of grey levels in H- and E-images (default: 16)')
    p.add_argument('--wsize', action='store', type=int, default=50,
                   help='Sliding window size (default: 50)')
    p.add_argument('--mask', action='store_true',
                   help='restrict the computation to the automatically detected tissue region')


    args = p.parse_args()
    img_file = args.img_file
    # model_file = args.model_file
    n_grey_levels = args.ngl
    w_size = args.wsize
    scale = args.scale
    out_file = args.out_file

    base_name = os.path.basename(img_file).split('.')
    if len(base_name) > 1:             # at least one .ext suffix
        base_name.pop()                # drop the extension
    base_name = '.'.join(base_name)    # reassemble the remaining parts into the base name

    img = skimage.io.imread(img_file)

    # with ModelPersistence(model_file, 'r', format='pickle') as d:
    #    rgb_models = d['models']

    img_h, img_e   = rgb2he(img, normalize=True)
    img_h          = requantize(img_h, nlevels=n_grey_levels, method='linear')
    img_e          = requantize(img_e, nlevels=n_grey_levels, method='linear')

    G = GaborDescriptor()
    if args.mask:
        mask, _ = tissue_region_from_rgb(img, _min_area=150)
        g_h = get_gabor_desc(img_h, G, w_size, scale, mask)
        g_e = get_gabor_desc(img_e, G, w_size, scale, mask)
    else:
        g_h = get_gabor_desc(img_h, G, w_size, scale)
        g_e = get_gabor_desc(img_e, G, w_size, scale)

    with open(out_file, 'w') as f:
        for d in g_h:
            f.write('\t'.join(str(x) for x in d))
            f.write('\n')
        for d in g_e:
            f.write('\t'.join(str(x) for x in d))
            f.write('\n')

    return
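The helpers rgb2he() and requantize() come from the surrounding package and are not shown here. As a rough, hypothetical illustration of what they might compute, below is a minimal sketch of H&E separation via Ruifrok-Johnston colour deconvolution and of linear grey-level requantization; the package's actual implementations may differ.

import numpy as np

# Assumed stain matrix (Ruifrok & Johnston); rows: haematoxylin, eosin, residual.
HE_STAINS = np.array([[0.650, 0.704, 0.286],
                      [0.072, 0.990, 0.105],
                      [0.268, 0.570, 0.776]])
HE_INV = np.linalg.inv(HE_STAINS / np.linalg.norm(HE_STAINS, axis=1, keepdims=True))

def rgb2he_sketch(img, normalize=False):
    # convert RGB to optical density, then project onto the stain basis
    od = -np.log((img.astype(np.float64) + 1.0) / 256.0)
    stains = od.reshape(-1, 3) @ HE_INV
    h = stains[:, 0].reshape(img.shape[:2])
    e = stains[:, 1].reshape(img.shape[:2])
    if normalize:
        h = (h - h.min()) / (h.max() - h.min() + 1e-12)
        e = (e - e.min()) / (e.max() - e.min() + 1e-12)
    return h, e

def requantize_sketch(img, nlevels=16, method='linear'):
    # map intensities linearly onto {0, ..., nlevels-1} (only 'linear' is sketched)
    lo, hi = float(img.min()), float(img.max())
    q = np.floor((img - lo) / (hi - lo + 1e-12) * nlevels)
    return np.clip(q, 0, nlevels - 1).astype(np.uint8)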
Example #2
def main():
    p = opt.ArgumentParser(description="""
            Extracts the Haematoxylin and Eosin components from an RGB image (of an H&E slide).
            """)
    p.add_argument('img_file', action='store', help='RGB image file')
    p.add_argument('--prefix', action='store',
                   help='optional prefix for the result files: prefix_[h|e].type',
                   default=None)
    p.add_argument('--histeq', action='store_true',
                   help='request histogram equalization of the results')
    p.add_argument('--meta', action='store_true',
                   help='store meta information associated with the results')

    args = p.parse_args()
    img_file = args.img_file

    base_name = os.path.basename(img_file).split('.')
    if len(base_name) > 1:             # at least one .ext suffix
        base_name.pop()                # drop the extension
    base_name = '.'.join(base_name)    # reassemble the remaining parts into the base name

    if args.prefix is not None:
        pfx = args.prefix
    else:
        pfx = base_name

    img = skimage.io.imread(img_file)

    img_h, img_e = rgb2he(img)

    if args.histeq:
        img_h = skimage.exposure.equalize_hist(img_h)
        img_e = skimage.exposure.equalize_hist(img_e)

    skimage.io.imsave(pfx + '_h.pgm', img_h)
    skimage.io.imsave(pfx + '_e.pgm', img_e)

    if args.meta:
        r = ET.Element('meta', attrib={'processor':'wsi_he'})
        t = ET.SubElement(r, 'file')
        t.text = img_file
        t = ET.SubElement(r, 'parameters')
        t1 = ET.SubElement(t, 'prefix')
        t1.text = args.prefix
        t1 = ET.SubElement(t, 'histeq')
        t1.text = str(args.histeq)
        t = ET.SubElement(r, 'outfile')
        t.text = pfx + '_h.pgm'
        t = ET.SubElement(r, 'outfile')
        t.text = pfx + '_e.pgm'
    
        raw_txt = ET.tostring(r, 'utf-8')
        reparsed = minidom.parseString(raw_txt)
        pp_txt = reparsed.toprettyxml(indent='  ')
        with open(pfx + '_he.meta.xml', 'w') as meta_file:
            meta_file.write(pp_txt)
        
    return
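For instance, assuming the script above is saved as wsi_he.py, running

    python wsi_he.py slide001.png --histeq --meta

would write slide001_h.pgm, slide001_e.pgm and slide001_he.meta.xml next to the input (script and file names here are illustrative).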
Example #3
def main():
    p = opt.ArgumentParser(description="""
            Classifies regions of an image (based on SURF) using a pre-built model (codebook).
            """)
    p.add_argument('in_file', action='store', help='image file name')
    p.add_argument('out_file', action='store', help='file to store the resulting classification')
    p.add_argument('model', action='store', help='file containing the model')
    p.add_argument('-a', '--annot', action='store', help='annotation file name', default=None)
    p.add_argument('-t', '--threshold', action='store', type=int, default=5000,
                   help='Hessian threshold for SURF features.')
    p.add_argument('-x', action='store', help='image name with patches classified', default=None)
    p.add_argument('-v', '--verbose', action='store_true', help='verbose?')
    
    args = p.parse_args()
    th = args.threshold
    
    if args.verbose:
        print("Image:", args.in_file)
        
    img = cv2.imread(args.in_file)
    mask = None
    
    with ModelPersistence(args.model, 'r', format='pickle') as mp:
        codebook = mp['codebook']
        Xm = mp['shift']
        Xs = mp['scale']
        standardize = mp['standardize']
        avg_dist = mp['avg_dist_to_centroid']
        sd_dist = mp['stddev_dist_to_centroid']
        
    if args.annot is not None:
        coords = np.fromfile(args.annot, dtype=int, sep=' ')  # x y - values
        coords = np.reshape(coords, (coords.size // 2, 2), order='C')
        # get the bounding box:
        xmin, ymin = coords.min(axis=0)
        xmax, ymax = coords.max(axis=0)
        img = img[ymin:ymax+3, xmin:xmax+3, :]            # keep only the region of interest

        if args.verbose:
            print("\t...building mask")
        
        mask = np.zeros(img.shape[0:2], dtype=np.uint8)
        r, c = skimage.draw.polygon(coords[:,1]-ymin, coords[:,0]-xmin) # adapt to new image...
        mask[r,c] = 1                                         # everything outside the region is black

    if args.verbose:
        print("\t...H&E extraction")

    img_h, _ = rgb2he(img, normalize=True)                # get the H- component
    img_h = equalize_adapthist(img_h)
    img_h = rescale_intensity(img_h, out_range=(0,255))
    
    # make sure the dtype is right for image and the mask: OpenCV is sensitive to data type
    img_h = img_h.astype(np.uint8)

    if mask is not None:
        img_h *= mask
    
    if args.verbose:
        print("\t...feature detection and computation")
    
    feat = cv2.xfeatures2d.SURF_create(hessianThreshold=th)
    keyp, desc = feat.detectAndCompute(img_h, mask)
    
    if args.verbose:
        print("\t...", str(len(keyp)), "features extracted")
        
    X = np.hstack(desc)
    X = np.reshape(X, (len(desc), desc[0].size), order='C')
    if standardize:
        # make sure each variable (column) is mean-centered and has unit standard deviation
        X = (X - Xm) / Xs
        
    if args.verbose:
        print("\t...classification")
        
    # instead of the following, allow for "no label":
    y0 = codebook.predict(X).tolist()
    y = np.zeros(X.shape[0], dtype=int) - 1
    d = np.zeros((X.shape[0], codebook.cluster_centers_.shape[0]))
    
    for k in range(0, codebook.cluster_centers_.shape[0]):
        d[:, k] = np.linalg.norm(X - codebook.cluster_centers_[k, :], axis=1)

    for i in range(0, d.shape[0]):
        # find the closest centroid among those within avg + 3*SD of their cluster
        j = np.where(d[i, :] < avg_dist + 3.0*sd_dist)[0]
        if j.size > 0:
            y[i] = j[np.argmin(d[i, j])]    # the label of the closest centroid
    
    #if np.any(y < 0):
    #    y = y[y >= 0]
        
    if args.verbose:
        print("\t...of", str(X.shape[0]), "patches,", str(int(np.sum(y >= 0))), "were assigned a label")
    with open(args.out_file, mode='w') as fout:
        for k in range(len(y)):
            s = '\t'.join([str(np.round(keyp[k].pt[0])), str(np.round(keyp[k].pt[1])),
                           str(np.round(keyp[k].size)), str(y[k]), str(y0[k])]) + '\n'
            fout.write(s)

    if args.x is not None:
        # construct a representation of the image based on the class labels
        img = adjust_gamma(img, 0.2)             # dim the image
        for k in range(len(y)):
            xc, yc = keyp[k].pt             # do not shadow the label vector y
            xc = int(np.round(xc))
            yc = int(np.round(yc))
            r = int(np.round(keyp[k].size))
            img[yc-int(r/2):yc+int(r/2), xc-int(r/2):xc+int(r/2), :] = (10, (10+2*k)%256, k%256)
        cv2.imwrite(args.x, img)
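The labelling rule used above (assign a patch to its nearest centroid only when the distance falls below that cluster's average plus three standard deviations, otherwise leave it at -1 for "no label") can be exercised in isolation; a self-contained sketch on made-up data:

import numpy as np

rng = np.random.RandomState(0)
centers = rng.rand(4, 64) * 10         # stand-in for codebook.cluster_centers_
avg_dist = np.full(4, 2.0)             # per-cluster average distance (made up)
sd_dist = np.full(4, 0.5)              # per-cluster standard deviation (made up)
X = rng.rand(10, 64) * 10              # stand-in for the patch descriptors

d = np.zeros((X.shape[0], centers.shape[0]))
for k in range(centers.shape[0]):
    d[:, k] = np.linalg.norm(X - centers[k, :], axis=1)

y = np.zeros(X.shape[0], dtype=int) - 1          # -1 means "no label"
for i in range(d.shape[0]):
    j = np.where(d[i, :] < avg_dist + 3.0 * sd_dist)[0]
    if j.size > 0:
        y[i] = j[np.argmin(d[i, j])]
print(y)                               # patches far from every centroid keep -1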
Example #4
def main():
    p = opt.ArgumentParser(description="""
            Extracts the Haematoxylin and Eosin components from an RGB image (of an H&E slide).
            """)
    p.add_argument('img_file', action='store', help='RGB image file')
    p.add_argument(
        '--prefix',
        action='store',
        help='optional prefix for the result files: prefix_[h|e].type',
        default=None)
    p.add_argument('--histeq',
                   action='store_true',
                   help='request histogram equalization of the results')
    p.add_argument('--meta',
                   action='store_true',
                   help='store meta information associated with the results')

    args = p.parse_args()
    img_file = args.img_file

    base_name = os.path.basename(img_file).split('.')
    if len(base_name) > 1:  # at least one .ext suffix
        base_name.pop()  # drop the extension
    base_name = '.'.join(base_name)  # reassemble the remaining parts

    if args.prefix is not None:
        pfx = args.prefix
    else:
        pfx = base_name

    img = skimage.io.imread(img_file)

    img_h, img_e = rgb2he(img)

    if args.histeq:
        img_h = skimage.exposure.equalize_hist(img_h)
        img_e = skimage.exposure.equalize_hist(img_e)

    skimage.io.imsave(pfx + '_h.pgm', img_h)
    skimage.io.imsave(pfx + '_e.pgm', img_e)

    if args.meta:
        r = ET.Element('meta', attrib={'processor': 'wsi_he'})
        t = ET.SubElement(r, 'file')
        t.text = img_file
        t = ET.SubElement(r, 'parameters')
        t1 = ET.SubElement(t, 'prefix')
        t1.text = args.prefix
        t1 = ET.SubElement(t, 'histeq')
        t1.text = str(args.histeq)
        t = ET.SubElement(r, 'outfile')
        t.text = pfx + '_h.pgm'
        t = ET.SubElement(r, 'outfile')
        t.text = pfx + '_e.pgm'

        raw_txt = ET.tostring(r, 'utf-8')
        reparsed = minidom.parseString(raw_txt)
        pp_txt = reparsed.toprettyxml(indent='  ')
        with open(pfx + '_he.meta.xml', 'w') as meta_file:
            meta_file.write(pp_txt)

    return
Example #5
def main():
    p = opt.ArgumentParser(description="""
    Assigns the regions of an image to the clusters of a codebook.
    """)
    p.add_argument('image', action='store', help='image file name')
    p.add_argument('model', action='store', help='model file name')
    p.add_argument('out_file', action='store', help='results file name')
    p.add_argument('-r', '--roi', action='store', nargs=4, type=int,
                   help='region of interest from the image as: row_min row_max col_min col_max',
                   default=None)
    args = p.parse_args()

    wsize = 32
    tmp  = np.array([0.0, np.pi / 4.0, np.pi / 2.0, 3.0 * np.pi / 4.0],
        dtype=np.double)
    tmp2 = np.array([3.0 / 4.0, 3.0 / 8.0, 3.0 / 16.0], dtype=np.double)
    tmp3 = np.array([1.0, 2 * np.sqrt(2.0)], dtype=np.double)

    desc = GaborDescriptor(theta=tmp, freq=tmp2, sigma=tmp3)

    image = skimage.io.imread(args.image)
    if image.ndim == 3:
        im_h, _ = rgb2he(image, normalize=True)
        im_h = equalize_adapthist(im_h)
        im_h = rescale_intensity(im_h, out_range=(0,255))
        im_h = im_h.astype(np.uint8)
        image = im_h
        im_h = None

    if args.roi is None:
        roi = (0, image.shape[0]-1, 0, image.shape[1]-1)
    else:
        roi = args.roi

    with ModelPersistence(args.model, 'r', format='pickle') as mp:
        codebook = mp['codebook']
        avg_dist = None
        sd_dist = None
        if 'avg_dist_to_centroid' in mp:
            avg_dist = mp['avg_dist_to_centroid']
        if 'stddev_dist_to_centroid' in mp:
            sd_dist = mp['stddev_dist_to_centroid']


    itw = sliding_window_on_regions(image.shape, [tuple(roi)], (wsize,wsize), step=(wsize,wsize))
    wnd = []
    labels = []
    dists = []
    buff_size = 100                  # every <buff_size> patches we do a classification
    X = np.zeros((buff_size, codebook.cluster_centers_[0].shape[0]))

    k = 0
    for r in itw:
        # adjust if needed:
        r2 = (r[0], r[1], r[2], r[3])
        wnd.append(r2)
        X[k,:] = desc.compute(image[r[0]:r[1], r[2]:r[3]])
        k += 1
        if k == buff_size:
            y = codebook.predict(X)
            Z = codebook.transform(X)
            labels.extend(y.tolist())
            dists.extend(Z[np.arange(buff_size), y].tolist())  # get the distances to the centroids of the assigned clusters
            k = 0                      # reset the block

    if k != 0:
        # some data is accumulated in X but not yet classified
        y = codebook.predict(X[0:k, :])
        Z = codebook.transform(X[0:k, :])
        labels.extend(y.tolist())
        dists.extend(Z[np.arange(k), y].tolist())  # distances to the centroids of the assigned clusters

    # save data
    with open(args.out_file, 'w') as f:
        n = len(wnd)                       # total number of descriptors of this type
        for k in range(n):
            s = '\t'.join([str(x_) for x_ in wnd[k]]) + '\t' + str(labels[k]) + \
                '\t' + str(dists[k]) + '\n'
            f.write(s)
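sliding_window_on_regions() is another package helper; judging from how its output is used above, it yields (row_min, row_max, col_min, col_max) windows tiling each region. A minimal sketch with that assumed interface, where windows that do not fit entirely inside a region are simply dropped:

def sliding_window_on_regions_sketch(shape, regions, wsize, step):
    # yield (row_min, row_max, col_min, col_max) windows inside each region
    wr, wc = wsize
    sr, sc = step
    for (r0, r1, c0, c1) in regions:
        for r in range(r0, min(r1, shape[0]) - wr + 1, sr):
            for c in range(c0, min(c1, shape[1]) - wc + 1, sc):
                yield (r, r + wr, c, c + wc)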
Example #6
def main():
    p = opt.ArgumentParser(description="""
            Extracts features from annotated regions and constructs a codebook of a given size.
            """)
    p.add_argument('in_file', action='store', help='a file with pairs of image and annotation files')
    p.add_argument('out_file', action='store', help='resulting model file name')
    p.add_argument('codebook_size', action='store', help='codebook size', type=int)
    p.add_argument('-t', '--threshold', action='store', type=int, default=5000,
                   help='Hessian threshold for SURF features.')
    p.add_argument('-s', '--standardize', action='store_true', default=False,
                   help='should the features be standardized before codebook construction?')
    p.add_argument('-x', action='store_true', help='save the image patches closest to the code blocks?')
    p.add_argument('-v', '--verbose', action='store_true', help='verbose?')
    
    args = p.parse_args()
    th = args.threshold
    
    all_key_points, all_descriptors, all_image_names = [], [], []
    all_roi = []
    with open(args.in_file, mode='r') as fin:
        for l in fin.readlines():
            l = l.strip()
            if len(l) == 0:
                break
            img_file, annot_file = l.split()[0:2]  # file names: image and its annotation
            
            if args.verbose:
                print("Image:", img_file)
                
            img = cv2.imread(img_file)
            coords = np.fromfile(annot_file, dtype=int, sep=' ')  # x y - values
            coords = np.reshape(coords, (coords.size // 2, 2), order='C')
            # get the bounding box:
            xmin, ymin = coords.min(axis=0)
            xmax, ymax = coords.max(axis=0)

            if args.verbose:
                print("\t...H&E extraction")

            img = img[ymin:ymax+2, xmin:xmax+2, :]                # keep only the region of interest
            img_h, _ = rgb2he(img, normalize=True)                # get the H- component
            img_h = equalize_adapthist(img_h)
            img_h = rescale_intensity(img_h, out_range=(0,255))
            
            # make sure the dtype is right for image and the mask: OpenCV is sensitive to data type
            img_h = img_h.astype(np.uint8)

            if args.verbose:
                print("\t...building mask")
                
            mask = np.zeros(img_h.shape, dtype=np.uint8)
            r, c = skimage.draw.polygon(coords[:,1]-ymin, coords[:,0]-xmin) # adapt to new image...
            mask[r,c] = 1                                         # everything outside the region is black
            
            if args.verbose:
                print("\t...feature detection and computation")
            
            img_h *= mask
            feat = cv2.xfeatures2d.SURF_create(hessianThreshold=th)
            keyp, desc = feat.detectAndCompute(img_h, mask)
            
            if args.verbose:
                print("\t...", str(len(keyp)), "features extracted")
                
            all_descriptors.extend(desc)
            if args.x:
                # only needed if saving patches:
                all_key_points.extend(keyp)
                all_image_names.extend([img_file] * len(keyp))
                all_roi.extend([(xmin, xmax, ymin, ymax)] * len(keyp))
        # end for
        
    if args.verbose:
        print("\nK-means clustering")
        
    X = np.hstack(all_descriptors)
    X = np.reshape(X, (len(all_descriptors), all_descriptors[0].size), order='C')
    Xm = np.zeros(X.shape[1])          # neutral shift/scale, stored even without standardization
    Xs = np.ones(X.shape[1])
    if args.standardize:
        # make sure each variable (column) is mean-centered and has unit standard deviation
        Xm = np.mean(X, axis=0)
        Xs = np.std(X, axis=0)
        Xs[np.isclose(Xs, 0.0)] = 1.0  # guard against (near-)zero standard deviations
        X = (X - Xm) / Xs
    
    if args.verbose:
        print("\t...with", str(X.shape[0]), "points")
        
    rng = np.random.RandomState(0)
    vq = MiniBatchKMeans(n_clusters=args.codebook_size, random_state=rng,
                         batch_size=500, compute_labels=True, verbose=False)   # vector quantizer

    vq.fit(X)

    # compute the average distance and std.dev. of the points in each cluster:
    avg_dist = np.zeros(args.codebook_size)
    sd_dist = np.zeros(args.codebook_size)
    for k in range(0, args.codebook_size):
        d = np.linalg.norm(X[vq.labels_ == k, :] - vq.cluster_centers_[k, :], axis=1)
        avg_dist[k] = d.mean()
        sd_dist[k] = d.std()
        
    with ModelPersistence(args.out_file, 'c', format='pickle') as d:
        d['codebook'] = vq
        d['shift'] = Xm
        d['scale'] = Xs
        d['standardize'] = args.standardize
        d['avg_dist_to_centroid'] = avg_dist
        d['stddev_dist_to_centroid'] = sd_dist

    if args.x:
        # find the closest patches to each centroid:
        idx = np.zeros(args.codebook_size, dtype=int)
        d = np.zeros(X.shape[0])
        for k in range(0, args.codebook_size):
            for i in range(0, X.shape[0]):
                d[i] = np.linalg.norm(X[i,:] - vq.cluster_centers_[k,:])
            idx[k] = d.argmin()        # the index of the closest patch to k-th centroid
        for k in range(0, args.codebook_size):
            i = idx[k]
            x, y = all_key_points[i].pt
            x = int(np.round(x))
            y = int(np.round(y))
            r = all_key_points[i].size   # diameter of the region
            img = cv2.imread(all_image_names[i])
            print("Image:", all_image_names[i],
                  "\tPatch (row_min->max, col_min->max):",
                  str(y+all_roi[i][2]-int(r/2)),
                  str(y+all_roi[i][2]+int(r/2)),
                  str(x+all_roi[i][0]-int(r/2)),
                  str(x+all_roi[i][0]+int(r/2)))
            patch = img[y+all_roi[i][2]-int(r/2):y+all_roi[i][2]+int(r/2),
                        x+all_roi[i][0]-int(r/2):x+all_roi[i][0]+int(r/2), :]
            cv2.imwrite('codeblock_'+str(k)+'.png', patch)
            
    return True
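The per-cluster spread statistics saved in the model (average and standard deviation of the distances to each centroid) can be reproduced on synthetic data with the same scikit-learn API; a small sketch:

import numpy as np
from sklearn.cluster import MiniBatchKMeans

rng = np.random.RandomState(0)
X = rng.rand(1000, 64)                           # stand-in for SURF descriptors
vq = MiniBatchKMeans(n_clusters=8, random_state=rng,
                     batch_size=500, compute_labels=True).fit(X)

avg_dist = np.zeros(8)
sd_dist = np.zeros(8)
for k in range(8):
    d = np.linalg.norm(X[vq.labels_ == k, :] - vq.cluster_centers_[k, :], axis=1)
    avg_dist[k] = d.mean()
    sd_dist[k] = d.std()
print(avg_dist.round(2), sd_dist.round(2))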
Example #7
def main():
    p = opt.ArgumentParser(description="""
            Computes textural tissue descriptors from an RGB image (of an H&E slide).
            """)
    p.add_argument('img_file',
                   action='store',
                   help='RGB image file of an H&E slide')
    p.add_argument('out_file',
                   action='store',
                   default='descriptors.dat',
                   help='Name of the result file')

    # p.add_argument('model_file', action='store', help='Models file')
    p.add_argument(
        '--scale',
        action='store',
        type=float,
        default=1.0,
        help=
        'Scale of the image at which the descriptors are computed (default: 1.0)'
    )
    p.add_argument(
        '--ngl',
        type=int,
        default=16,
        action='store',
        help='Number of grey levels in H- and E-images (default: 16)')
    p.add_argument('--wsize',
                   action='store',
                   type=int,
                   default=50,
                   help='Sliding window size (default: 50)')
    p.add_argument('--mask', action='store_true',
                   help='restrict the computation to the automatically detected tissue region')

    args = p.parse_args()
    img_file = args.img_file
    # model_file = args.model_file
    n_grey_levels = args.ngl
    w_size = args.wsize
    scale = args.scale
    out_file = args.out_file

    base_name = os.path.basename(img_file).split('.')
    if len(base_name) > 1:  # at least one .ext suffix
        base_name.pop()  # drop the extension
    base_name = '.'.join(base_name)  # reassemble the remaining parts

    img = skimage.io.imread(img_file)

    # with ModelPersistence(model_file, 'r', format='pickle') as d:
    #    rgb_models = d['models']

    img_h, img_e = rgb2he(img, normalize=True)
    img_h = requantize(img_h, nlevels=n_grey_levels, method='linear')
    img_e = requantize(img_e, nlevels=n_grey_levels, method='linear')

    G = GaborDescriptor()
    if args.mask:
        mask, _ = tissue_region_from_rgb(img, _min_area=150)
        g_h = get_gabor_desc(img_h, G, w_size, scale, mask)
        g_e = get_gabor_desc(img_e, G, w_size, scale, mask)
    else:
        g_h = get_gabor_desc(img_h, G, w_size, scale)
        g_e = get_gabor_desc(img_e, G, w_size, scale)

    with open(out_file, 'w') as f:
        for d in g_h:
            f.write('\t'.join(str(x) for x in d))
            f.write('\n')
        for d in g_e:
            f.write('\t'.join(str(x) for x in d))
            f.write('\n')

    return
Example #8
def main():
    p = opt.ArgumentParser(description="""
            Extracts features from annotated regions and constructs a codebook of a given size.
            """)
    p.add_argument('in_file', action='store', help='a file with image file, annotation file and label (0/1)')
    p.add_argument('out_file', action='store', help='resulting model file name')
    #p.add_argument('codebook_size', action='store', help='codebook size', type=int)
    p.add_argument('-t', '--threshold', action='store', type=int, default=5000,
                   help='Hessian threshold for SURF features.')
    p.add_argument('-s', '--standardize', action='store_true', default=False,
                   help='should the features be standardized before codebook construction?')
    p.add_argument('-v', '--verbose', action='store_true', help='verbose?')
    
    args = p.parse_args()
    th = args.threshold
    
    all_image_names, all_descriptors = [], []
    all_roi = []
    y = []
    unique_image_names = []
    with open(args.in_file, mode='r') as fin:
        for l in fin.readlines():
            l = l.strip()
            if len(l) == 0:
                break
            img_file, annot_file, lbl = l.split()[0:3]  # file names: image, its annotation and label
            y.append(int(lbl))
            
            if args.verbose:
                print("Image:", img_file)
                
            img = cv2.imread(img_file)
            coords = np.fromfile(annot_file, dtype=int, sep=' ')  # x y - values
            coords = np.reshape(coords, (coords.size // 2, 2), order='C')
            # get the bounding box:
            xmin, ymin = coords.min(axis=0)
            xmax, ymax = coords.max(axis=0)

            if args.verbose:
                print("\t...H&E extraction")

            img = img[ymin:ymax+2, xmin:xmax+2, :]                # keep only the region of interest
            img_h, _ = rgb2he(img, normalize=True)                # get the H- component
            img_h = equalize_adapthist(img_h)
            img_h = rescale_intensity(img_h, out_range=(0,255))
            
            # make sure the dtype is right for image and the mask: OpenCV is sensitive to data type
            img_h = img_h.astype(np.uint8)

            if args.verbose:
                print("\t...building mask")
                
            mask = np.zeros(img_h.shape, dtype=np.uint8)
            r, c = skimage.draw.polygon(coords[:,1]-ymin, coords[:,0]-xmin) # adapt to new image...
            mask[r,c] = 1                                         # everything outside the region is black
            
            if args.verbose:
                print("\t...feature detection and computation")
            
            img_h *= mask
            feat = cv2.xfeatures2d.SURF_create(hessianThreshold=th)
            keyp, desc = feat.detectAndCompute(img_h, mask)
            
            if args.verbose:
                print("\t...", str(len(keyp)), "features extracted")
                
            all_descriptors.extend(desc)
            all_image_names.extend([img_file] * len(keyp))
            unique_image_names.append(img_file)            
        # end for
            
    X = np.hstack(all_descriptors)
    X = np.reshape(X, (len(all_descriptors), all_descriptors[0].size), order='C')
    Xm = np.zeros(X.shape[1])          # neutral shift/scale, stored even without standardization
    Xs = np.ones(X.shape[1])
    if args.standardize:
        # make sure each variable (column) is mean-centered and has unit standard deviation
        Xm = np.mean(X, axis=0)
        Xs = np.std(X, axis=0)
        Xs[np.isclose(Xs, 0.0)] = 1.0  # guard against (near-)zero standard deviations
        X = (X - Xm) / Xs
    
    y = np.array(y, dtype=int)
    
    rng = np.random.RandomState(0)
    acc = []                           # will keep accuracy of the classifier
    vqs = []                           # all quantizers, to find the best
    for k in np.arange(10, 121, 10):
        # Method:
        # -generate a codebook with k codewords
        # -re-code the data
        # -compute frequencies
        # -estimate classification on best 10 features
        
        if args.verbose:
            print("\nK-means clustering (k =", str(k), ")")
            print("\t...with", str(X.shape[0]), "points")
        
        #-codebook and re-coding
        vq = MiniBatchKMeans(n_clusters=k, random_state=rng,
                         batch_size=500, compute_labels=True, verbose=False)   # vector quantizer
        vq.fit(X)
        vqs.append(vq)
        
        #-codeword frequencies
        frq = np.zeros((len(unique_image_names), k))
        for i in range(vq.labels_.size):
            frq[unique_image_names.index(all_image_names[i]), vq.labels_[i]] += 1.0

        for i in range(len(unique_image_names)):
            if frq[i, :].sum() > 0:
                frq[i, :] /= frq[i, :].sum()

        if args.verbose:
            print("...\tfeature selection (t-test)")
        pv = np.ones(k)
        for i in range(k):
            _, pv[i] = ttest_ind(frq[y == 0, i], frq[y == 1, i])
        idx = np.argsort(pv)         # order of the p-values
        if args.verbose:
            print("\t...classification performance estimation")
        clsf = LDA(solver='lsqr', shrinkage='auto').fit(frq[:,idx[:10]], y) # keep top 10 features
        acc.append(clsf.score(frq[:, idx[:10]], y))
    
    acc = np.array(acc)
    k = np.arange(10, 121, 10)[acc.argmax()]  # best k
    if args.verbose:
        print("\nOptimal codebook size:", str(k))

    # final codebook:
    vq = vqs[acc.argmax()]

    # compute the average distance and std.dev. of the points in each cluster:
    avg_dist = np.zeros(k)
    sd_dist = np.zeros(k)
    for j in range(0, k):              # do not shadow the selected codebook size k
        d = np.linalg.norm(X[vq.labels_ == j, :] - vq.cluster_centers_[j, :], axis=1)
        avg_dist[j] = d.mean()
        sd_dist[j] = d.std()

    with ModelPersistence(args.out_file, 'c', format='pickle') as d:
        d['codebook'] = vq
        d['shift'] = Xm
        d['scale'] = Xs
        d['standardize'] = args.standardize
        d['avg_dist_to_centroid'] = avg_dist
        d['stddev_dist_to_centroid'] = sd_dist

    return True
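The model-selection loop above scores each candidate codebook size by building per-image codeword frequency histograms, ranking codewords with a two-sample t-test between the two classes, and fitting a shrinkage LDA on the ten most discriminative ones. The ranking step on its own, with made-up frequencies (ttest_ind as imported in the original module, presumably from scipy.stats):

import numpy as np
from scipy.stats import ttest_ind

rng = np.random.RandomState(0)
frq = rng.rand(20, 30)             # 20 images x 30 codeword frequencies (made up)
y = np.repeat([0, 1], 10)          # binary image labels

pv = np.ones(frq.shape[1])
for i in range(frq.shape[1]):
    _, pv[i] = ttest_ind(frq[y == 0, i], frq[y == 1, i])
idx = np.argsort(pv)               # most discriminative codewords first
print(idx[:10])                    # the ten features handed to the LDA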
Example #9
def main():
    p = opt.ArgumentParser(description="""
            Extracts features from annotated regions and constructs a codebook of a given size.
            """)
    p.add_argument(
        'in_file',
        action='store',
        help='a file with image file, annotation file and label (0/1)')
    p.add_argument('out_file',
                   action='store',
                   help='resulting model file name')
    #p.add_argument('codebook_size', action='store', help='codebook size', type=int)
    p.add_argument('-t',
                   '--threshold',
                   action='store',
                   type=int,
                   default=5000,
                   help='Hessian threshold for SURF features.')
    p.add_argument(
        '-s',
        '--standardize',
        action='store_true',
        default=False,
        help='should the features be standardized before codebook construction?'
    )
    p.add_argument('-v', '--verbose', action='store_true', help='verbose?')

    args = p.parse_args()
    th = args.threshold

    all_image_names, all_descriptors = [], []
    all_roi = []
    y = []
    unique_image_names = []
    with open(args.in_file, mode='r') as fin:
        for l in fin.readlines():
            l = l.strip()
            if len(l) == 0:
                break
            img_file, annot_file, lbl = l.split()[
                0:3]  # file names: image, its annotation and label
            y.append(int(lbl))

            if args.verbose:
                print("Image:", img_file)

            img = cv2.imread(img_file)
            coords = np.fromfile(annot_file, dtype=int,
                                 sep=' ')  # x y - values
            coords = np.reshape(coords, (coords.size // 2, 2), order='C')
            # get the bounding box:
            xmin, ymin = coords.min(axis=0)
            xmax, ymax = coords.max(axis=0)

            if args.verbose:
                print("\t...H&E extraction")

            img = img[ymin:ymax + 2,
                      xmin:xmax + 2, :]  # keep only the region of interest
            img_h, _ = rgb2he(img, normalize=True)  # get the H- component
            img_h = equalize_adapthist(img_h)
            img_h = rescale_intensity(img_h, out_range=(0, 255))

            # make sure the dtype is right for image and the mask: OpenCV is sensitive to data type
            img_h = img_h.astype(np.uint8)

            if args.verbose:
                print("\t...building mask")

            mask = np.zeros(img_h.shape, dtype=np.uint8)
            r, c = skimage.draw.polygon(coords[:, 1] - ymin, coords[:, 0] -
                                        xmin)  # adapt to new image...
            mask[r, c] = 1  # everything outside the region is black

            if args.verbose:
                print("\t...feature detection and computation")

            img_h *= mask
            feat = cv2.xfeatures2d.SURF_create(hessianThreshold=th)
            keyp, desc = feat.detectAndCompute(img_h, mask)

            if args.verbose:
                print("\t...", str(len(keyp)), "features extracted")

            all_descriptors.extend(desc)
            all_image_names.extend([img_file] * len(keyp))
            unique_image_names.append(img_file)
        # end for

    X = np.hstack(all_descriptors)
    X = np.reshape(X, (len(all_descriptors), all_descriptors[0].size),
                   order='C')
    Xm = np.zeros(X.shape[1])  # neutral shift/scale, stored even without standardization
    Xs = np.ones(X.shape[1])
    if args.standardize:
        # make sure each variable (column) is mean-centered and has unit standard deviation
        Xm = np.mean(X, axis=0)
        Xs = np.std(X, axis=0)
        Xs[np.isclose(Xs, 0.0)] = 1.0  # guard against (near-)zero standard deviations
        X = (X - Xm) / Xs

    y = np.array(y, dtype=int)

    rng = np.random.RandomState(0)
    acc = []  # will keep accuracy of the classifier
    vqs = []  # all quantizers, to find the best
    for k in np.arange(10, 121, 10):
        # Method:
        # -generate a codebook with k codewords
        # -re-code the data
        # -compute frequencies
        # -estimate classification on best 10 features

        if args.verbose:
            print("\nK-means clustering (k =", str(k), ")")
            print("\t...with", str(X.shape[0]), "points")

        #-codebook and re-coding
        vq = MiniBatchKMeans(n_clusters=k,
                             random_state=rng,
                             batch_size=500,
                             compute_labels=True,
                             verbose=False)  # vector quantizer
        vq.fit(X)
        vqs.append(vq)

        #-codeword frequencies
        frq = np.zeros((len(unique_image_names), k))
        for i in range(vq.labels_.size):
            frq[unique_image_names.index(all_image_names[i]),
                vq.labels_[i]] += 1.0

        for i in range(len(unique_image_names)):
            if frq[i, :].sum() > 0:
                frq[i, :] /= frq[i, :].sum()

        if args.verbose:
            print("...\tfeature selection (t-test)")
        pv = np.ones(k)
        for i in range(k):
            _, pv[i] = ttest_ind(frq[y == 0, i], frq[y == 1, i])
        idx = np.argsort(pv)  # order of the p-values
        if args.verbose:
            print("\t...classification performance estimation")
        clsf = LDA(solver='lsqr',
                   shrinkage='auto').fit(frq[:, idx[:10]],
                                         y)  # keep top 10 features
        acc.append(clsf.score(frq[:, idx[:10]], y))

    acc = np.array(acc)
    k = np.arange(10, 121, 10)[acc.argmax()]  # best k
    if args.verbose:
        print("\nOptimal codebook size:", str(k))

    # final codebook:
    vq = vqs[acc.argmax()]

    # compute the average distance and std.dev. of the points in each cluster:
    avg_dist = np.zeros(k)
    sd_dist = np.zeros(k)
    for j in range(0, k):  # do not shadow the selected codebook size k
        d = np.linalg.norm(X[vq.labels_ == j, :] -
                           vq.cluster_centers_[j, :],
                           axis=1)
        avg_dist[j] = d.mean()
        sd_dist[j] = d.std()

    with ModelPersistence(args.out_file, 'c', format='pickle') as d:
        d['codebook'] = vq
        d['shift'] = Xm
        d['scale'] = Xs
        d['standardize'] = args.standardize
        d['avg_dist_to_centroid'] = avg_dist
        d['stddev_dist_to_centroid'] = sd_dist

    return True
Example #10
def main():
    p = opt.ArgumentParser(description="""
            Extracts features from annotated regions and constructs a codebook of a given size.
            """)
    p.add_argument('in_file',
                   action='store',
                   help='a file with pairs of image and annotation files')
    p.add_argument('out_file',
                   action='store',
                   help='resulting model file name')
    p.add_argument('codebook_size',
                   action='store',
                   help='codebook size',
                   type=int)
    p.add_argument('-t',
                   '--threshold',
                   action='store',
                   type=int,
                   default=5000,
                   help='Hessian threshold for SURF features.')
    p.add_argument(
        '-s',
        '--standardize',
        action='store_true',
        default=False,
        help='should the features be standardized before codebook construction?'
    )
    p.add_argument('-x',
                   action='store_true',
                   help='save the image patches closest to the code blocks?')
    p.add_argument('-v', '--verbose', action='store_true', help='verbose?')

    args = p.parse_args()
    th = args.threshold

    all_key_points, all_descriptors, all_image_names = [], [], []
    all_roi = []
    with open(args.in_file, mode='r') as fin:
        for l in fin.readlines():
            l = l.strip()
            if len(l) == 0:
                break
            img_file, annot_file = l.split()[
                0:2]  # file names: image and its annotation

            if args.verbose:
                print("Image:", img_file)

            img = cv2.imread(img_file)
            coords = np.fromfile(annot_file, dtype=int,
                                 sep=' ')  # x y - values
            coords = np.reshape(coords, (coords.size // 2, 2), order='C')
            # get the bounding box:
            xmin, ymin = coords.min(axis=0)
            xmax, ymax = coords.max(axis=0)

            if args.verbose:
                print("\t...H&E extraction")

            img = img[ymin:ymax + 2,
                      xmin:xmax + 2, :]  # keep only the region of interest
            img_h, _ = rgb2he(img, normalize=True)  # get the H- component
            img_h = equalize_adapthist(img_h)
            img_h = rescale_intensity(img_h, out_range=(0, 255))

            # make sure the dtype is right for image and the mask: OpenCV is sensitive to data type
            img_h = img_h.astype(np.uint8)

            if args.verbose:
                print("\t...building mask")

            mask = np.zeros(img_h.shape, dtype=np.uint8)
            r, c = skimage.draw.polygon(coords[:, 1] - ymin, coords[:, 0] -
                                        xmin)  # adapt to new image...
            mask[r, c] = 1  # everything outside the region is black

            if args.verbose:
                print("\t...feature detection and computation")

            img_h *= mask
            feat = cv2.xfeatures2d.SURF_create(hessianThreshold=th)
            keyp, desc = feat.detectAndCompute(img_h, mask)

            if args.verbose:
                print("\t...", str(len(keyp)), "features extracted")

            all_descriptors.extend(desc)
            if args.x:
                # only needed if saving patches:
                all_key_points.extend(keyp)
                all_image_names.extend([img_file] * len(keyp))
                all_roi.extend([(xmin, xmax, ymin, ymax)] * len(keyp))
        # end for

    if args.verbose:
        print("\nK-means clustering")

    X = np.hstack(all_descriptors)
    X = np.reshape(X, (len(all_descriptors), all_descriptors[0].size),
                   order='C')
    Xm = np.zeros(X.shape[1])  # neutral shift/scale, stored even without standardization
    Xs = np.ones(X.shape[1])
    if args.standardize:
        # make sure each variable (column) is mean-centered and has unit standard deviation
        Xm = np.mean(X, axis=0)
        Xs = np.std(X, axis=0)
        Xs[np.isclose(Xs, 0.0)] = 1.0  # guard against (near-)zero standard deviations
        X = (X - Xm) / Xs

    if args.verbose:
        print("\t...with", str(X.shape[0]), "points")

    rng = np.random.RandomState(0)
    vq = MiniBatchKMeans(n_clusters=args.codebook_size,
                         random_state=rng,
                         batch_size=500,
                         compute_labels=True,
                         verbose=False)  # vector quantizer

    vq.fit(X)

    # compute the average distance and std.dev. of the points in each cluster:
    avg_dist = np.zeros(args.codebook_size)
    sd_dist = np.zeros(args.codebook_size)
    for k in range(0, args.codebook_size):
        d = np.linalg.norm(X[vq.labels_ == k, :] -
                           vq.cluster_centers_[k, :],
                           axis=1)
        avg_dist[k] = d.mean()
        sd_dist[k] = d.std()

    with ModelPersistence(args.out_file, 'c', format='pickle') as d:
        d['codebook'] = vq
        d['shift'] = Xm
        d['scale'] = Xs
        d['standardize'] = args.standardize
        d['avg_dist_to_centroid'] = avg_dist
        d['stddev_dist_to_centroid'] = sd_dist

    if args.x:
        # find the closest patches to each centroid:
        idx = np.zeros(args.codebook_size, dtype=int)
        d = np.zeros(X.shape[0])
        for k in range(0, args.codebook_size):
            for i in range(0, X.shape[0]):
                d[i] = np.linalg.norm(X[i, :] - vq.cluster_centers_[k, :])
            idx[k] = d.argmin()  # the index of the closest patch to k-th centroid
        for k in range(0, args.codebook_size):
            i = idx[k]
            x, y = all_key_points[i].pt
            x = int(np.round(x))
            y = int(np.round(y))
            r = all_key_points[i].size  # diameter of the region
            img = cv2.imread(all_image_names[i])
            print("Image:", all_image_names[i],
                  "\tPatch (row_min->max, col_min->max):",
                  str(y + all_roi[i][2] - int(r / 2)),
                  str(y + all_roi[i][2] + int(r / 2)),
                  str(x + all_roi[i][0] - int(r / 2)),
                  str(x + all_roi[i][0] + int(r / 2)))
            patch = img[y + all_roi[i][2] - int(r / 2):y + all_roi[i][2] +
                        int(r / 2), x + all_roi[i][0] - int(r / 2):x +
                        all_roi[i][0] + int(r / 2), :]
            cv2.imwrite('codeblock_' + str(k) + '.png', patch)

    return True