Example #1
def createDataset(sources, output):
    """
    Create a codebook for a bag-of-visual-words representation using k-means clustering.
    """
    global has_joblib
    out_path = str(output)
    # delete the output file
    if os.path.exists(out_path):
        os.remove(out_path)
       
    # first, list the source files
    fpaths_src, fnames_src = utils.listFiles(directory=os.path.abspath(sources), ext='png')
         
    n_imgs = len(fpaths_src)
    
    all_features_list = []
    
    # parallel implementation (default, if joblib available)
    if has_joblib:
        image_features = Parallel(n_jobs=args.njobs, verbose=5)(delayed(processImage)(fpaths_src, fnames_src, img_idx) for img_idx in range(n_imgs))
        # stack the features of the individual images
        image_features = np.concatenate(image_features, axis=0)
        all_features_list.append(image_features)
    else:
        for img_idx in xrange(n_imgs):
            image_features = processImage(fpaths_src, fnames_src, img_idx)
            image_features = np.concatenate(image_features, axis=0)
            all_features_list.append(image_features)
        
        
    # create k clusters from all features
    print "Clustering (k=%s)"%str(args.k)
    feat_matrix = np.concatenate(all_features_list, axis=0).astype(np.float32)
    criteria = (cv2.TERM_CRITERIA_EPS + cv2.TERM_CRITERIA_MAX_ITER, 10, 1.0)
    flags = cv2.KMEANS_RANDOM_CENTERS
    _,labels,codebook = cv2.kmeans(
                                   feat_matrix,
                                   args.k,
                                   criteria,
                                   10,
                                   flags) 
        
    # write the codebook to the file using np.savetxt()
    np.savetxt(out_path, 
               codebook, 
               delimiter=' ', 
               header=('Codebook, %s words, %s dimensions'%(str(args.k),str(feat_matrix.shape[1]))))
    
    return 0
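
A minimal usage sketch for the codebook builder above, assuming the surrounding module already provides the globals it relies on (`args` with `k` and `njobs`, `has_joblib`, `utils`, `processImage`); the directory name, output file, and parameter values are purely illustrative:

import argparse

# hypothetical module-level args as read by createDataset(); values are made up
args = argparse.Namespace(k=256, njobs=4)

# cluster all PNG images under the source directory into a 256-word codebook
createDataset(sources='data/train_patches', output='codebook_k256.txt')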
Example #2
def createDataset(sources, output, labels, sparse):
    """
    Create a dataset by vectorizing the images and writing them line by line to a text file.
    Each pixel is a feature and is thus stored in libsvm format:

    [label] [index0:value0] [index1:value1] ... [indexN:valueN]
    """
    global has_joblib
    out_path = str(output)
    # delete the output file
    if os.path.exists(os.path.abspath(out_path)):
        os.remove(os.path.abspath(out_path))
    
    # first, list the source files
    fpaths_src, fnames_src = utils.listFiles(directory=os.path.abspath(sources), ext='png')
    
    label_map = {}

    # read the label file
    if labels is not None:
        label_map = utils.readLabelMap(labels)
        # check that the numbers match
        print("Number of images in label map : %s" % str(len(label_map.keys()) - 1))
        print("Number of images in source dir: %s" % str(len(fpaths_src)))
        assert len(label_map.keys()) - 1 == len(fpaths_src)
    
    # generate KNN classifier
    if args.codebook not in (None, 'None'):
        args.knn = getKNNClassifier()
    else:
        args.knn = None
    
    # precompute number of images
    n_imgs = len(fpaths_src)
    
    # preallocate array
    # if augmentation, calculate (9*4+1)*n samples
    all_features_list = []
        
    # parallel implementation (default, if joblib available)
    if has_joblib:
        image_features = Parallel(n_jobs=args.njobs, verbose=5)(delayed(processImage)(fpaths_src, label_map, fnames_src, img_idx) for img_idx in range(n_imgs))
        # collect all images into a single matrix
        image_features = np.concatenate(image_features, axis=0)
        all_features_list.append(image_features)
    else:
        for img_idx in xrange(n_imgs):
            image_features = processImage(fpaths_src, label_map, fnames_src, img_idx)
            all_features_list.append(image_features)
    
    # make a 2D matrix from the list of features (stack all images vertically)
    feat_matrix = np.concatenate(all_features_list, axis=0).astype(np.float32)    
      
    # do scaling of each feature dimension
    if args.scale != 0:
        print "Scaling data..."
        
        # preserve the labels
        label_vec = feat_matrix[:,0]
        feat_matrix = np.delete(feat_matrix,0,1)
        
        featurestats = np.zeros((2,feat_matrix.shape[1]))
        
        # use soft-normalization (zero-mean, unit-variance whitening)
        if args.scale == 1:
            # if featurestats from a training set were specified, use them
            if args.featurestats is not None:
                # load the statistics: 2 rows (first = mean, second = std), n feature dimensions
                featurestats = loadFeatureStats()
                assert feat_matrix.shape[1] == featurestats.shape[1]

        # use hard-normalization (min/max scaling)
        elif args.scale == 2:
            # if featurestats from a training set were specified, use them
            if args.featurestats is not None:
                # load the statistics: 2 rows (first = min, second = max), n feature dimensions
                featurestats = loadFeatureStats()
                assert feat_matrix.shape[1] == featurestats.shape[1]
        
        
        # normalize each feature dimension
        for feat_idx in xrange(feat_matrix.shape[1]):
            feat_vec = feat_matrix[:,feat_idx]
            
            # soft-normalization (zero-mean, approx. unit variance)
            if args.scale == 1:
                # if feature statistics were specified, use them
                if args.featurestats is not None:
                    feat_mean = featurestats[0, feat_idx]
                    feat_std = featurestats[1, feat_idx]
                else:
                    # compute them from the data
                    feat_mean = feat_vec.mean()
                    feat_std = feat_vec.std() + 1e-10
                    # store them
                    featurestats[0, feat_idx] = feat_mean
                    featurestats[1, feat_idx] = feat_std

                # shift to zero mean and scale to (approx.) unit variance
                feat_vec_scaled = (feat_vec - feat_mean) / feat_std
                
            
            # hard-normalization (min/max borders estimated from the (training) dataset)
            elif args.scale == 2:
                # if feature statistics were specified, use them
                if args.featurestats is not None:
                    feat_min = featurestats[0, feat_idx]
                    feat_max = featurestats[1, feat_idx]
                else:
                    # compute them freshly from the data
                    feat_min = np.min(feat_vec)
                    feat_max = np.max(feat_vec)
                    # store them
                    featurestats[0, feat_idx] = feat_min
                    featurestats[1, feat_idx] = feat_max

                # normalize to [0, 1]
                feat_vec_std = (feat_vec - feat_min) / (feat_max - feat_min + 1e-10)

                # then linearly rescale to [-1, 1]
                feat_vec_scaled = 2.0 * feat_vec_std - 1.0
             
                     
            # set column back to matrix
            feat_matrix[:,feat_idx] = feat_vec_scaled
        
        # finally prepend the label_vec again
        feat_matrix = np.concatenate((np.reshape(label_vec,(feat_matrix.shape[0],1)),feat_matrix), axis=1)
        
        print "Done."
    else:
        print "Data may not be properly scaled, use the 'svm-scale' implementation of libsvm."
 
    # featurestats is only populated when scaling was enabled above
    if args.savefeaturestats is not None and args.scale != 0:
        saveFeatureStats(featurestats)

    # open the output file
    output_file = open(os.path.abspath(out_path), 'wb')

    # run through the feature matrix    
    print "Writing %s rows and %s cols to file..."%(feat_matrix.shape)
    # parallel implementation (default, if joblib available)
    if has_joblib:
        lines = Parallel(n_jobs=args.njobs, verbose=5)(delayed(writeLine)(i, feat_matrix) for i in range(feat_matrix.shape[0]))
        output_file.writelines(lines)   
    else:
        for i in xrange(feat_matrix.shape[0]):
            line = writeLine(i, feat_matrix)
            output_file.write(line)
    
    output_file.close()
    
    return 0
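
A minimal usage sketch for the dataset writer above, under the same assumption that the module-level context exists (`args`, `utils.readLabelMap`, `processImage`, `writeLine`); every path and option value below is illustrative only:

import argparse

# hypothetical args covering only the fields read by createDataset(); values are made up
args = argparse.Namespace(njobs=4, codebook=None, scale=1,
                          featurestats=None, savefeaturestats=None)

# writes one "[label] [index:value] ..." line per PNG image found in the source directory
createDataset(sources='data/train_patches',
              output='train.libsvm',
              labels='data/train_labels.txt',
              sparse=False)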