import cv2
import numpy as np
from pyflann import FLANN
# Pref and alloc_lists are project-local helpers, assumed importable from
# elsewhere in this codebase.


def approximate_kmeans(data, K=int(1e6), max_iters=1000, flann_pref=None):
    if flann_pref is None:
        flann_pref = Pref()
        flann_pref.algorithm = Pref('kdtree')
        flann_pref.trees = Pref(8)
        flann_pref.checks = Pref(128)
    flann_args = flann_pref.to_dict()
    float_data = np.array(data, dtype=np.float32)
    N = float_data.shape[0]
    print('Approximately clustering %d data vectors into %d clusters' % (N, K))
    np.random.seed(seed=0)  # For reproducibility
    # Initialize to random cluster centers
    centx = np.random.choice(N, size=K, replace=False)
    cent = np.copy(float_data[centx])
    assign = alloc_lists(K)  # One list of assigned data indexes per cluster center

    def flann_one_time(data_vecs, query_vecs, num_neighbs, flann_args):
        # Build a FLANN index over data_vecs and find each query vector's
        # num_neighbs approximate nearest neighbors
        flann = FLANN()
        flann.build_index(data_vecs, **flann_args)
        (index_list, dist_list) = flann.nn_index(query_vecs, num_neighbs,
                                                 checks=flann_args['checks'])
        return (index_list, dist_list)

    for iterx in xrange(0, max_iters):
        print('Iteration ' + str(iterx))
        # Step 1: Find each data vector's nearest cluster center
        datax2_centx, _ = flann_one_time(cent, float_data, 1, flann_args)
        # Step 2: Assign data to cluster centers
        datax_sort = datax2_centx.argsort()
        centx_sort = datax2_centx[datax_sort]
        # Efficiently trace over sorted centers with two pointers. Take care
        # to include the last batch of data vectors with the same center index.
        converged = True
        prev_centx = -1
        _L = 0
        dbg_total_assigned = 0
        dbg_assigned_list = []
        # Loop over datapoints, going 1 past the end, and group them
        # data = 0[ . . . . . . . . . . . . .]N
        # ptrs =   L     R
        #          |- k -|L R
        #                 |- k+1 |L       R
        #                                 |_K|
        for _R in xrange(N + 1):
            if _R == N or centx_sort[_L] != centx_sort[_R]:
                # We found a group. Assign this group cluster index: centx
                centx = centx_sort[_L]
                # SPECIAL CASE: (akmeans might not assign everything)
                if centx - prev_centx > 1:  # Check if a cluster got skipped
                    for skipx in xrange(prev_centx + 1, centx):
                        print('  Skipping Index: ' + str(skipx))
                        if len(assign[skipx]) != 0:
                            converged = False
                        assign[skipx] = []
                prev_centx = centx
                # Set assignments
                num_members = np.float32(_R - _L)
                dbg_total_assigned += num_members
                centx_membx = datax_sort[_L:_R]
                # DBG CODE, keep track of data vectors you've assigned
                # print('  Assigning %d data vectors to center index: %d' % (num_members, centx))
                # for x in centx_membx:
                #     dbg_assigned_list.append(x)
                # /DBGCODE
                if not np.array_equal(assign[centx], centx_membx):
                    converged = False
                assign[centx] = centx_membx
                # Recompute centers as the mean of their assigned members
                cent[centx] = float_data[centx_membx, :].sum(axis=0) / num_members
                _L = _R
        # print('  Did assignment of %d centers' % prev_centx)
        # print('  Assigned %d data vectors in total' % dbg_total_assigned)
        # SPECIAL CASE: clusters skipped at the very end must also be cleared
        if prev_centx < K:
            for skipx in xrange(prev_centx + 1, K):
                print('  Cluster Index %d was empty' % skipx)
                if len(assign[skipx]) != 0:
                    converged = False
                assign[skipx] = []
        if converged:  # Assignments have not changed
            print('akmeans converged in ' + str(iterx) + ' iterations')
            break
    return cent, assign
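# --- Illustration (not part of the original module) ---
# A self-contained sketch of the two-pointer grouping used in Step 2 above:
# once each datapoint's nearest-center index is sorted, every cluster's
# members appear as one contiguous slice, so a single pass recovers all
# groups. The toy array and the helper name _group_by_center_demo are
# hypothetical, chosen only to mirror the variable names above.
def _group_by_center_demo():
    datax2_centx = np.array([2, 0, 2, 1, 0, 2])  # nearest center per datapoint
    datax_sort = datax2_centx.argsort()          # datapoint ids ordered by center
    centx_sort = datax2_centx[datax_sort]        # -> [0, 0, 1, 2, 2, 2]
    N = len(centx_sort)
    _L = 0
    for _R in xrange(N + 1):  # go one past the end to flush the last group
        if _R == N or centx_sort[_L] != centx_sort[_R]:
            print('center %d <- datapoints %s' % (centx_sort[_L], datax_sort[_L:_R]))
            _L = _R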
def init_preferences(am, default_bit=False):
    iom = am.hs.iom
    if am.algo_prefs is None:
        am.algo_prefs = Pref(fpath=iom.get_prefs_fpath('algo_prefs'))
    # Define the pipeline stages
    am.algo_prefs.preproc = Pref(parent=am.algo_prefs)  # Low-level chip operations
    am.algo_prefs.chiprep = Pref(parent=am.algo_prefs)  # Extracting chip features
    am.algo_prefs.model = Pref(parent=am.algo_prefs, hidden=True)  # Building the model
    am.algo_prefs.query = Pref(parent=am.algo_prefs)  # Searching the model
    am.algo_prefs.results = Pref(parent=am.algo_prefs)  # Presenting the results
    # --- Chip Preprocessing ---
    # (selection, options, prefs?)
    am.algo_prefs.preproc.sqrt_num_pxls = Pref(700)
    am.algo_prefs.preproc.autocontrast_bit = Pref(False, hidden=True)
    am.algo_prefs.preproc.bilateral_filt_bit = Pref(False)
    am.algo_prefs.preproc.histeq_bit = Pref(True)
    am.algo_prefs.preproc.contrast_stretch_bit = Pref(False)
    am.algo_prefs.preproc.adapt_histeq_bit = Pref(False)
    # --- Chip Representation ---
    # Currently one feature detector and one feature descriptor are chosen
    am.algo_prefs.chiprep.use_gravity_vector = True
    opencv_detectors = ['FAST', 'STAR', 'SIFT', 'SURF', 'ORB', 'BRISK', 'MSER',
                        'GFTT', 'HARRIS', 'Dense', 'SimpleBlob']
    # Keypoint detector: add a list of OpenCV detectors
    am.algo_prefs.chiprep.kpts_detector = Pref(0, choices=['heshesaff'] + opencv_detectors)
    # Keypoint detector parameters: add a list of associated preferences
    # types = {0: 'int', 1: 'bool', 2: 'double', 7: 'float', 9: 'int64', 11: 'unsigned char'}
    for detector_type in opencv_detectors:
        det_dep = (am.algo_prefs.chiprep.kpts_detector_internal, detector_type)
        det_pref = Pref(depeq=det_dep)
        det = cv2.FeatureDetector_create(detector_type)
        for param_name in det.getParams():
            param_type = det.paramType(param_name)
            if param_type in [0, 8, 9, 11]:
                param_val = det.getInt(param_name)
            elif param_type == 1:
                param_val = det.getBool(param_name)
            elif param_type in [2, 7]:
                param_val = det.getDouble(param_name)
            else:
                raise Exception('name: ' + str(param_name) +
                                ' type: ' + str(param_type))
            det_pref[param_name] = param_val
        am.algo_prefs.chiprep[detector_type + '_params'] = det_pref
    am.algo_prefs.chiprep.kpts_extractor = Pref(0, choices=('SIFT',), hidden=True)  #, '#SURF', '#BRISK'))
    # --- Vocabulary ---
    am.algo_prefs.model.quantizer = Pref(0, choices=('naive_bayes', 'akmeans'), hidden=True)
    #nbnn_dep = (am.algo_prefs.model.quantizer_internal, 'naive_bayes')
    #am.algo_prefs.model.naive_bayes = Pref(depeq=nbnn_dep)
    #am.algo_prefs.model.naive_bayes.num_nearest = Pref('wip')
    #am.algo_prefs.model.naive_bayes.pseudo_num_words = Pref('wip')
    akm_dep = (am.algo_prefs.model.quantizer_internal, 'akmeans')
    am.algo_prefs.model.akmeans = Pref(depeq=akm_dep, hidden=True)
    am.algo_prefs.model.akmeans.num_words = Pref(1000)
    am.algo_prefs.model.akmeans.max_iters = Pref(1000)
    hkm_dep = (am.algo_prefs.model.quantizer_internal, 'hkmeans')
    am.algo_prefs.model.hkmeans = Pref(depeq=hkm_dep, hidden=True)
    am.algo_prefs.model.hkmeans.branching = Pref(10)
    am.algo_prefs.model.hkmeans.depth = Pref(6)
    flann_kdtree = Pref(hidden=True)
    flann_kdtree.algorithm = Pref(default=1, choices=['linear', 'kdtree', 'kmeans',
                                                      'composite', 'autotuned'])
    flann_kdtree.trees = Pref(8, min=0, max=30)       # Build prefs
    flann_kdtree.checks = Pref(128, min=0, max=4096)  # Search prefs
    # Autotuned-specific preferences
    autotune_spef = (flann_kdtree.algorithm_internal, 'autotuned')
    flann_kdtree.target_precision = Pref(0.95, depeq=autotune_spef)
    flann_kdtree.build_weight = Pref(0.01, depeq=autotune_spef)
    flann_kdtree.memory_weight = Pref(0.86, depeq=autotune_spef,
                                      doc='the time-search tradeoff')
    flann_kdtree.sample_fraction = Pref(0.86, depeq=autotune_spef,
                                        doc='the train fraction')
    # HKMeans-specific preferences
    hkmeans_spef = (flann_kdtree.algorithm_internal, 'kmeans')
    flann_kdtree.branching = Pref(10, depeq=hkmeans_spef)
    flann_kdtree.iterations = Pref(6, depeq=hkmeans_spef, doc='num levels')
    flann_kdtree.centers_init = Pref(choices=['random', 'gonzales', 'kmeansapp'],
                                     depeq=hkmeans_spef)
    flann_kdtree.cb_index = Pref(0, min=0, max=5, depeq=hkmeans_spef, doc='''
        This parameter (cluster boundary index) influences the way exploration
        is performed in the hierarchical kmeans tree. When cb_index is zero,
        the next kmeans domain to be explored is chosen to be the one with the
        closest center. A value greater than zero also takes into account the
        size of the domain.''')
    am.algo_prefs.model.indexer = flann_kdtree  # Pref(0, choices=[flann_kdtree])
    # --- Query Prefs ---
    am.algo_prefs.query.k = Pref(1, min=1, max=50)
    am.algo_prefs.query.num_rerank = Pref(1000, min=0)
    am.algo_prefs.query.spatial_thresh = Pref(0.05, min=0, max=1)
    am.algo_prefs.query.sigma_thresh = Pref(0.05, min=0, max=1, hidden=True)  #: Unimplemented
    am.algo_prefs.query.method = Pref(2, choices=['COUNT', 'DIFF', 'LNRAT', 'RAT'])  #, '#TFIDF'
    am.algo_prefs.query.score = Pref(0, choices=['cscore', 'nscore'])  # move to results?
    am.algo_prefs.query.self_as_result_bit = Pref(False)  #: Return self (in terms of name) in results
    am.algo_prefs.query.remove_other_names = Pref(False)  #: Remove all results with the same identified name as the query
    # --- Result Prefs ---
    am.algo_prefs.results.score = Pref(0, choices=('cscore', 'nscore'))  # move to results?
    am.algo_prefs.results.one_result_per_name = Pref(False)  #: Return at most one result per name
    am.algo_prefs.results.match_threshold = Pref(0)
    am.algo_prefs.results.min_num_results = Pref(5)
    am.algo_prefs.results.max_num_results = Pref(5)
    am.algo_prefs.results.extra_num_results = Pref(0)
    if not default_bit:
        am.algo_prefs.load()
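# --- Illustration (not part of the original module) ---
# A hedged sketch of how the flann_kdtree preferences above are presumably
# consumed: to_dict() is assumed to flatten them into a plain keyword dict
# that pyflann accepts directly. The literal dict and the helper name
# _flann_args_demo are hypothetical stand-ins, not the project's actual
# to_dict() output.
def _flann_args_demo():
    flann_args = {'algorithm': 'kdtree', 'trees': 8, 'checks': 128}
    data_vecs = np.random.rand(1000, 128).astype(np.float32)
    query_vecs = np.random.rand(10, 128).astype(np.float32)
    flann = FLANN()
    flann.build_index(data_vecs, **flann_args)
    index_list, dist_list = flann.nn_index(query_vecs, 1,
                                           checks=flann_args['checks'])
    print('nearest neighbor per query: %s' % index_list)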