def build_extr_str(self):
  if self.extr_str is None:
    # Use the generic 'build_extr_str' helper
    from cufacesearch.extractor.generic_extractor import build_extr_str
    # Signature: build_extr_str(featurizer_type, detector_type, input_type)
    self.extr_str = build_extr_str(self.featurizer_type, self.detector_type, self.input_type)
  return self.extr_str
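# A minimal sketch of the generic helper imported above, assuming it simply
# concatenates the three identifiers into a single extraction-type string.
# The exact format is defined in cufacesearch.extractor.generic_extractor;
# this stand-in (name and separator included) is hypothetical, for
# illustration only.
def _sketch_build_extr_str(featurizer_type, detector_type, input_type):
  # e.g. ("sbpycaffe", "full", "image") -> "sbpycaffe_full_image"
  return "_".join([featurizer_type, detector_type, input_type])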
def __init__(self, global_conf, prefix=default_extr_proc_prefix):
  self.extractor = None
  self.nb_empt = 0
  self.nb_err = 0
  self.max_proc_time = 1200  # in seconds. Increased for sbcmdline...
  self.url_input = True
  super(ExtractionProcessor, self).__init__(global_conf, prefix)

  self.input_type = self.get_required_param("input_type")
  self.nb_threads = self.get_required_param("nb_threads")
  self.featurizer_type = self.get_required_param("featurizer_type")
  self.featurizer_prefix = self.get_required_param("featurizer_prefix")
  self.detector_type = self.get_required_param("detector_type")
  # "full" means we extract features from the whole image
  if self.detector_type == "full":
    self.detector = None

  self.verbose = 0
  verbose = self.get_param("verbose")
  if verbose:
    self.verbose = int(verbose)

  self.ingestion_input = "kafka"
  ingestion_input = self.get_param("ingestion_input")
  if ingestion_input:
    self.ingestion_input = ingestion_input

  file_input = self.get_param("file_input")
  print("[{}.ExtractionProcessor: log] file_input: {}".format(self.pp, file_input))
  if file_input:
    self.url_input = False
  print("[{}.ExtractionProcessor: log] url_input: {}".format(self.pp, self.url_input))

  if self.url_input:
    self.img_column = img_URL_column
  else:
    self.img_column = img_path_column
  print("[{}.ExtractionProcessor: log] img_column: {}".format(self.pp, self.img_column))

  # Needs to be built from extraction type and detection input + "_processed"
  self.extr_family_column = "ext"
  tmp_extr_family_column = self.get_param("extr_family_column")
  if tmp_extr_family_column:
    self.extr_family_column = tmp_extr_family_column

  self.push_back = False
  push_back = self.get_param("push_back")
  if push_back:
    self.push_back = True

  self.extr_prefix = build_extr_str(self.featurizer_type, self.detector_type, self.input_type)
  self.set_pp()

  # Initialize queues
  self.init_queues()

  # Initialize extractors only once (just one first)
  self.extractors = []
  # for i in range(self.nb_threads):
  #   self.extractors.append(GenericExtractor(self.detector_type, self.featurizer_type, self.input_type,
  #                                           self.extr_family_column, self.featurizer_prefix,
  #                                           self.global_conf))
  self.extractors.append(GenericExtractor(self.detector_type, self.featurizer_type, self.input_type,
                                          self.extr_family_column, self.featurizer_prefix,
                                          self.global_conf))

  # Beware: self.extr_family_column should be added to the indexer families parameter in get_create_table...
  # What if the table has some other column families?...
  self.tablesha1_col_families = {'info': dict(), self.extr_family_column: dict()}

  # Initialize indexer
  self.indexer = HBaseIndexerMinimal(self.global_conf, prefix=self.get_required_param("indexer_prefix"))
  self.last_update_date_id = ''

  # Initialize ingester
  self.ingester = GenericKafkaProcessor(self.global_conf, prefix=self.get_required_param("proc_ingester_prefix"))
  self.ingester.pp = "ep"
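# The optional parameters above follow a "set default, then override if the
# configuration provides a value" idiom. A minimal standalone sketch of that
# pattern for a plain dict configuration (hypothetical helper, for
# illustration only; the real lookup goes through self.get_param):
def _sketch_get_param(conf, key, default=None):
  # Mirror the truthiness check used above: any falsy configured value
  # (None, 0, "", False) falls back to the default.
  value = conf.get(key)
  return value if value else default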
def __init__(self, global_conf, prefix=DEFAULT_EXTR_PROC_PREFIX):
  """ExtractionProcessor constructor

  :param global_conf: configuration file or dictionary
  :type global_conf: str, dict
  :param prefix: prefix in configuration
  :type prefix: str
  """
  self.extractor = None
  self.nb_empt = 0
  self.nb_err = 0
  self.max_proc_time = 1200  # in seconds. Increased for sbcmdline...
  self.url_input = True
  super(ExtractionProcessor, self).__init__(global_conf, prefix)

  # TODO: move that to self.read_conf()
  # Get required parameters
  self.input_type = self.get_required_param("input_type")
  self.nb_threads = self.get_required_param("nb_threads")
  self.featurizer_type = self.get_required_param("featurizer_type")
  self.featurizer_prefix = self.get_required_param("featurizer_prefix")
  self.detector_type = self.get_required_param("detector_type")

  # Get optional parameters
  self.verbose = int(self.get_param("verbose", default=0))
  self.ingestion_input = self.get_param("ingestion_input", default="kafka")
  self.push_back = self.get_param("push_back", default=False)

  file_input = self.get_param("file_input")
  print("[{}.ExtractionProcessor: log] file_input: {}".format(self.pp, file_input))
  if file_input:
    self.url_input = False
  print("[{}.ExtractionProcessor: log] url_input: {}".format(self.pp, self.url_input))

  # "full" means we extract features from the whole image
  if self.detector_type == "full":
    self.detector = None

  self.extr_prefix = build_extr_str(self.featurizer_type, self.detector_type, self.input_type)
  self.set_pp()

  # Initialize queues
  self.init_queues()

  # Initialize indexers. We now have two indexers:
  # - one "in_indexer" for the TF table with buffer, img URLs etc.
  # - one "out_indexer" for our table with extractions etc.
  # NB: they could be the same if tables are merged...
  self.out_indexer = HBaseIndexerMinimal(self.global_conf, prefix=self.get_required_param("indexer_prefix"))
  prefix_in_indexer = self.get_param("in_indexer_prefix", default=False)
  if prefix_in_indexer:
    self.in_indexer = HBaseIndexerMinimal(self.global_conf, prefix=prefix_in_indexer)
    insha1tab = self.in_indexer.table_sha1infos_name
    insha1cfs = self.in_indexer.get_dictcf_sha1_table()
    print("[{}] 'in_indexer' sha1 table {} columns are: {}".format(self.pp, insha1tab, insha1cfs))
  else:
    print("[{}] empty 'in_indexer_prefix', using out_indexer as in_indexer too.".format(self.pp))
    self.in_indexer = self.out_indexer

  # Initialize extractors only once (just one first)
  self.extractors = []
  # DONE: use 'out_indexer'
  self.extractors.append(GenericExtractor(self.detector_type, self.featurizer_type, self.input_type,
                                          self.out_indexer.extrcf, self.featurizer_prefix,
                                          self.global_conf))

  # DONE: use 'in_indexer'
  if self.url_input:
    self.img_column = self.in_indexer.get_col_imgurl()
  else:
    self.img_column = self.in_indexer.get_col_imgpath()
  img_cols = [self.in_indexer.get_col_imgbuff(), self.in_indexer.get_col_imgurlbak(), self.img_column]
  print("[{}.ExtractionProcessor: log] img_cols: {}".format(self.pp, img_cols))

  self.last_update_date_id = ''

  # Initialize ingester
  self.ingester = GenericKafkaProcessor(self.global_conf, prefix=self.get_required_param("proc_ingester_prefix"))
  self.ingester.pp = "ep"
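# A compact sketch of the dual-indexer fallback used above: a dedicated "in"
# handle is created only when its prefix is configured, otherwise the "out"
# handle is reused for both roles. The function name and factory argument are
# hypothetical; this only isolates the pattern for illustration.
def _sketch_pick_indexers(make_indexer, out_prefix, in_prefix=None):
  # make_indexer is any callable building an indexer from a prefix,
  # e.g. lambda p: HBaseIndexerMinimal(global_conf, prefix=p)
  out_indexer = make_indexer(out_prefix)
  in_indexer = make_indexer(in_prefix) if in_prefix else out_indexer
  return in_indexer, out_indexer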