def main(args):
    data_dir = args.data_dir
    out_dir = args.out_dir
    if data_dir is None:
        raise ValueError('data_dir or database should be assigned.')
    container = EmbeddingContainer()
    container.load(data_dir)
    hg = HierarchicalGrouping(container)
    df = hg.auto_label_subgroup(label_id=1)
    print(df)
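# A minimal CLI sketch for the grouping script above. Hedged: the flag names
# mirror the attributes read from `args` (`data_dir`, `out_dir`), but the
# short prefixes, defaults, and help strings are assumptions, not the
# project's actual parser.
import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='Auto-label subgroups from a saved EmbeddingContainer.')
    parser.add_argument('--data_dir', type=str, default=None,
                        help='Path to the exported EmbeddingContainer.')
    parser.add_argument('--out_dir', type=str, default=None,
                        help='Where to save grouping results.')
    main(parser.parse_args())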
def main(args):
    data_dir = args.data_dir
    out_dir = args.output_dir
    container = EmbeddingContainer()
    container.load(data_dir)
    tsne = TSNE(container,
                n_iter=args.iterations,
                n_jobs=args.n_jobs,
                perplexity=args.perplexity)
    tsne.run()
    tsne.save_fig(out_dir)
def main(args):
    data_dir = args.data_dir
    out_dir = args.out_dir
    query_command = args.query_command
    anchor_command = args.anchor_command
    # TODO: sanity check
    container = EmbeddingContainer()
    result = ResultContainer()
    container.load(data_dir)

    command = '{}->{}'.format(query_command, anchor_command)
    query_ids, anchor_ids = container.get_instance_id_by_cross_reference_command(command)
    query_embeddings = container.get_embedding_by_instance_ids(query_ids)
    anchor_embeddings = container.get_embedding_by_instance_ids(anchor_ids)
    num_of_anchor = anchor_embeddings.shape[0]
    num_of_query = query_embeddings.shape[0]

    agent = IndexAgent(agent_type='HNSW',
                       instance_ids=anchor_ids,
                       embeddings=anchor_embeddings)

    all_query_ids, all_retrieved_ids, all_retrieved_distances = [], [], []
    with tqdm(total=num_of_query) as pbar:
        for _idx, (query_id, query_emb) in enumerate(zip(query_ids, query_embeddings)):
            retrieved_ids, retrieved_distances = agent.search(
                query_emb, top_k=num_of_anchor)
            retrieved_ids = np.squeeze(retrieved_ids)
            retrieved_distances = np.squeeze(retrieved_distances)
            all_query_ids.extend(np.array(query_id).repeat(num_of_anchor))
            all_retrieved_ids.extend(retrieved_ids)
            all_retrieved_distances.extend(retrieved_distances)
            pbar.update()

    print('Indexing finished, {} retrieved events'.format(len(all_retrieved_ids)))
    print('Start exporting results...')
    start_time = time()
    result._event_buffer = pd.DataFrame({
        Fields.query_instance_id: all_query_ids,
        Fields.retrieved_instance_id: all_retrieved_ids,
        Fields.retrieved_distance: all_retrieved_distances,
    })
    result.save(out_dir)
    print('Done, saving results takes {} seconds.'.format(time() - start_time))
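# Usage sketch (hedged): `query_command` and `anchor_command` are attribute
# names understood by `get_instance_id_by_cross_reference_command`; the
# attribute names `seen`/`unseen` and the paths below are hypothetical
# placeholders, not values from this project.
from argparse import Namespace

args = Namespace(data_dir='/path/to/container',   # placeholder path
                 out_dir='/path/to/results',      # placeholder path
                 query_command='unseen',          # hypothetical attribute
                 anchor_command='seen')           # hypothetical attribute
main(args)  # builds the HNSW index over anchors and ranks every query against it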
def main(args):
    data_dir = args.data_dir
    out_dir = args.out_dir
    if data_dir is None:
        raise ValueError('data_dir or database should be assigned.')
    feature_object = FeatureObject()
    feature_object.load(data_dir)

    embeddings = feature_object.embeddings
    embeddings = np.squeeze(embeddings)
    filename_strings = feature_object.filename_strings
    label_ids = feature_object.label_ids
    label_names = feature_object.label_names
    instance_ids = np.arange(embeddings.shape[0])

    # Push all embeddings into container
    embedding_container = EmbeddingContainer(
        embedding_size=embeddings.shape[1],
        prob_size=0,
        container_size=embeddings.shape[0])
    for emb, inst_id, label_id in zip(embeddings, instance_ids, label_ids):
        embedding_container.add(inst_id, label_id, emb)

    manifold = Manifold(embedding_container, label_names)
    centers = manifold.class_center()
    c2c_matrix = manifold.center_to_center_relation()
    c2all_relation = manifold.center_to_all_instance_relation()
    for center_label, center_feature in centers.items():
        manifold.distance_trace(center_label, center_feature, 200)
    manifold.locality_analysis()
def create_embedding_container_from_featobj(folder_path, verbose=True):
    """Directly load a feature object into an embedding container.

    Args:
        folder_path: String, path to the folder of FeatureObject.
        verbose: Boolean, show the size of the feature object if set True.
    Return:
        container: EmbeddingContainer
    """
    feature_importer = FeatureObject()
    feature_importer.load(folder_path)
    embeddings = feature_importer.embeddings
    filenames = feature_importer.filename_strings
    instance_ids = feature_importer.instance_ids
    labels = feature_importer.label_ids
    label_names = feature_importer.label_names
    probabilities = feature_importer.probabilities
    has_label_name = label_names is not None  # TODO
    has_prob = probabilities is not None

    # pseudo instance_ids
    pseudo_instance_ids = np.arange(embeddings.shape[0])
    if instance_ids is None or instance_ids.size == 0:
        instance_ids = pseudo_instance_ids

    num_feature, dim_feature = embeddings.shape
    if verbose:
        print('{} features with dim-{} are loaded'.format(num_feature, dim_feature))

    # err handling: label_ids.shape == 0
    container = EmbeddingContainer(embedding_size=dim_feature,
                                   prob_size=0,
                                   container_size=num_feature)
    if not has_label_name:
        for inst_id, feat, label in zip(instance_ids, embeddings, labels):
            inst_id = int(inst_id)
            container.add(inst_id, label, feat)
    else:
        for inst_id, feat, label, name in zip(instance_ids, embeddings,
                                              labels, label_names):
            inst_id = int(inst_id)
            container.add(inst_id, label, feat, label_name=name)
    return container
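# Usage sketch (hedged): `/path/to/featobj` is a placeholder for a folder
# previously saved by FeatureObject; `counts` is the container attribute
# used elsewhere in this codebase to report how many instances were added.
container = create_embedding_container_from_featobj('/path/to/featobj')
print(container.counts)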
def main():
    args = parser.parse_args()
    config_path = args.config
    data_type = args.data_type
    data_dir = args.data_dir
    out_dir = args.out_dir
    anchor_database_dir = args.anchor_database
    status = status_fields.not_determined

    # check input is given
    if not data_dir:
        raise ValueError('data_dir must be assigned!')
    if out_dir is not None:
        if not os.path.exists(out_dir):
            os.makedirs(out_dir)

    # argument logic
    if data_dir and anchor_database_dir:
        status = status_fields.evaluate_query_anchor
    elif data_dir and anchor_database_dir is None:
        status = status_fields.evaluate_single_container

    if not config_path:
        # TODO: @kv: Generate the default config.
        raise ValueError('evaluation configuration must be assigned!')
    try:
        with open(config_path, 'r') as fp:
            config_dict = yaml.safe_load(fp)
    except Exception:
        # TODO: create default config instead of error.
        raise IOError('Can not load yaml from {}.'.format(config_path))

    # Prepare data container
    container = None
    for case in switch(status):
        print('{} Executes {}'.format(APP_SIGNATURE, status))
        if case(status_fields.evaluate_single_container):
            container = EmbeddingContainer(name='single_container')
            if data_type in ['embedding_container', 'embedding_db']:
                container.load(data_dir)
            # end of switch case
            break
        if case(status_fields.evaluate_query_anchor):
            """TODO: Use native method: merge()
                1. Merge two containers
                2. Add `query->anchor` command in cross_reference
                3. Change number of database
            """
            container = EmbeddingContainer(name='query')
            anchor_container = EmbeddingContainer(name='anchor')
            # load query
            if data_type in ['embedding_container', 'embedding_db']:
                container.load(data_dir)
            # load anchor
            if data_type in ['embedding_container', 'embedding_db']:
                anchor_container.load(anchor_database_dir)
            container.merge(anchor_container,
                            merge_key='merge_record',
                            label_id_rearrange=True)
            # clear buffer
            anchor_container.clear()

            # Change config. TODO: A little bit hacky, modify in the future
            # TODO: It seems not to work well
            _opt = config_fields.evaluation_options
            _rank = 'RankingEvaluation'
            _attr = config_fields.attribute
            _cref = config_fields.cross_reference
            _smp = config_fields.sampling
            _cmd = 'merge_record.query -> merge_record.anchor'
            config_dict[_opt][_rank][_attr][_cref] = list(
                filter(None, config_dict[_opt][_rank][_attr][_cref]))
            if _cmd not in config_dict[_opt][_rank][_attr][_cref]:
                config_dict[_opt][_rank][_attr][_cref].append(_cmd)
            config_dict[_opt][_rank][_smp]['num_of_db_instance_per_class'] = 1000
            # end of switch case
            break

    # Build and run evaluation
    evaluator = EvaluatorBuilder(args.embedding_size,
                                 args.prob_size,
                                 config_dict,
                                 mode='offline')
    print(container)
    evaluator.add_container(container)
    evaluator.evaluate()

    # Show results
    for eval_name, result_container in evaluator.results.items():
        print(eval_name)
        display_name = display_namemap.get(eval_name, eval_name)
        reporter = ReportWriter(result_container)
        overall_report = reporter.overall_report
        print(overall_report)
        if out_dir:
            path = '/'.join([out_dir, 'result_{}'.format(display_name)])
            result_container.save(path)

    if status == status_fields.evaluate_query_anchor and out_dir:
        path = '/'.join([out_dir, 'merged_container'])
        container.save(path)
def main(args):
    data_dir_1 = args.data_dir_1
    data_dir_2 = args.data_dir_2
    keyword = args.keyword
    name_data_1 = args.name_1
    name_data_2 = args.name_2
    output_dir = args.output_dir

    container = EmbeddingContainer(name=name_data_1)
    container.load(data_dir_1)
    db_container = EmbeddingContainer(name=name_data_2)
    db_container.load(data_dir_2)
    container.merge(db_container, merge_key=keyword)
    db_container.clear()
    print(container)
    print(container.DataFrame)
    container.save(output_dir)
class EvaluatorBuilder(object):
    """Evaluator Builder & Interface."""

    def __init__(self, embedding_size, prob_size, config_dict, mode='online'):
        """Evaluator Builder.

        The object builds evaluation functions according to the given
        configuration and manages shared data (embeddings, labels and
        attributes) in container objects.

        Args:
            embedding_size: Integer, size of the 1d embedding.
            prob_size: Integer, size of the logits.
            config_dict: Dict, loaded from a yaml-format config.
            mode: String, `online` or `offline`.

        Building procedure: (TODO @kv: update these steps)
            * parse the config
            * allocate containers
            * create evaluations
            * add datum
            * run evaluate
            * (optional) get update_ops

        TODO:
            - deprecate attribute container
        """
        self.configs = ConfigParser(config_dict)
        # allocate shared embedding containers
        container_size = self.configs.container_size
        self.embedding_size = embedding_size
        self.prob_size = prob_size
        self.embedding_container = EmbeddingContainer(embedding_size,
                                                      prob_size,
                                                      container_size)
        self.mode = mode
        if self.mode not in ['online', 'offline']:
            raise ValueError('Evaluator mode: {} is not defined.'.format(self.mode))
        self._build()

        self._instance_counter = 0
        self._total_metrics = {}
        self._results = {}

        # Allocate general query interface
        if not self.configs.database[config_fields.database_type]:
            # TODO @kv: consistent check with query condition
            print('No attribute database')
            self.query_interface = None
        else:
            self.query_interface = QueryInterface(self.configs.database)
            print('Attribute database is initialized.')

    def _build(self):
        """Build: parse the config and create evaluators."""
        # Allocate evaluation objects with corresponding configurations.
        # NOTE: iterate over a copy, since unqualified names are removed
        # from the chosen list in-place.
        self.evaluations = {}
        for eval_name in list(self.configs.chosen_evaluation_names):
            if eval_name == eval_fields.classification and self.prob_size == 0:
                print('{} is assigned, but prob_size == 0, '
                      'removed from the chosen list.'.format(eval_name))
                # remove the chosen name from the list
                self.configs.chosen_evaluation_names.remove(eval_name)
                continue
            eval_config = self.configs.get_eval_config(eval_name)
            self.evaluations[eval_name] = REGISTERED_EVALUATION_OBJECTS[eval_name](
                eval_config, self.mode)

    @property
    def evaluation_names(self):
        # NOTE: evaluation_types from config; evaluation_names from object instance.
        return self.configs.chosen_evaluation_names

    @property
    def metric_names(self):
        _metric_names = []
        for _eval_name in self.configs.chosen_evaluation_names:
            if _eval_name in EVALUATION_DISPLAY_NAMES:
                _display_eval_name = EVALUATION_DISPLAY_NAMES[_eval_name]
            else:
                _display_eval_name = _eval_name
            _metric_name_per_evaluation = self.evaluations[_eval_name].metric_names
            for _metric_name in _metric_name_per_evaluation:
                _metric_name = '{}/{}'.format(_display_eval_name, _metric_name)
                _metric_names.append(_metric_name)
        return _metric_names

    def add_instance_id_and_embedding(self, instance_id, label_id,
                                      embedding, probability=None):
        """Add embedding and label for a sample to be used for evaluation.

        If the query attribute names are given in config, this function
        will search them on the database automatically.

        Args:
            instance_id: An integer identifier for the image.
            label_id: An integer describing the class.
            embedding: List or numpy array, the embedding (feature vector).
        """
        # NOTE: If we call classification, then add probability.
        # TODO @kv: If instance_id is None, use index as default.
        if instance_id is None or instance_id == -1:
            instance_id = self._instance_counter
        if not isinstance(instance_id, int):
            instance_id = int(instance_id)
        if not isinstance(label_id, int):
            label_id = int(label_id)

        if self.query_interface:
            queried_attributes = self.query_interface.query(instance_id)
            self.embedding_container.add(instance_id, label_id, embedding,
                                         probability,
                                         attribute=queried_attributes)
        else:
            self.embedding_container.add(instance_id, label_id, embedding,
                                         probability)

        # verbose for developing stage.
        if self.embedding_container.counts % 1000 == 0:
            if probability is None:
                print('{} embeddings are added.'.format(
                    self.embedding_container.counts))
            else:
                print('{} embeddings and probabilities are added.'.format(
                    self.embedding_container.counts))
        self._instance_counter += 1

    def add_container(self, embedding_container=None):
        """Add a filled container which should be prepared beforehand.

        Args:
            embedding_container: EmbeddingContainer, default is None.
        Notice:
            Sanity check: TODO @kv: Think about how to cooperate with attributes
        """
        # replace the internal container
        if embedding_container is not None:
            if not isinstance(embedding_container, EmbeddingContainer):
                # TODO: raise a type error instead of returning silently
                return
            self.embedding_container.clear()
            self.embedding_container = embedding_container
            print('Update embedding container.')

    def evaluate(self):
        """Execute given evaluations and return a dictionary of metrics.

        Return:
            total_metrics: A flattened dictionary for displaying each measure.
        """
        for _eval_name, _evaluation in self.evaluations.items():
            # Pass the container to the evaluation objects.
            res_container = _evaluation.compute(self.embedding_container)
            self._results[_eval_name] = res_container
            # TODO: flatten results and return
            if _eval_name in EVALUATION_DISPLAY_NAMES:
                _display_name = EVALUATION_DISPLAY_NAMES[_eval_name]
            else:
                _display_name = _eval_name
            if res_container:
                self._total_metrics[_display_name] = res_container.flatten
            else:
                self._total_metrics[_display_name] = {}

        flatten = {}
        for _eval_name, _content in self._total_metrics.items():
            for _metric, _value in _content.items():
                _combined_name = '{}/{}'.format(_eval_name, _metric)
                flatten[_combined_name] = _value
        return flatten

    @property
    def results(self):
        return self._results

    def clear(self):
        """Clears the state to prepare for a fresh evaluation."""
        self.embedding_container.clear()
        for _, _container in self._total_metrics.items():
            _container.clear()
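# Online-mode usage sketch (hedged): `eval_config.yml` is a hypothetical
# config path and the schema must match what ConfigParser expects; the
# embedding values are random placeholders, not real features.
import numpy as np
import yaml

with open('eval_config.yml', 'r') as fp:   # hypothetical config path
    config_dict = yaml.safe_load(fp)

evaluator = EvaluatorBuilder(embedding_size=128, prob_size=0,
                             config_dict=config_dict, mode='online')
for inst_id in range(1000):
    embedding = np.random.rand(128)        # placeholder feature vector
    evaluator.add_instance_id_and_embedding(inst_id,
                                            label_id=inst_id % 10,
                                            embedding=embedding)
metrics = evaluator.evaluate()             # flattened {evaluation/metric: value}
for name, value in metrics.items():
    print(name, value)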