def __init__(self, global_conf, prefix=default_extr_check_prefix, pid=None):
  self.list_extr_prefix = []
  self.pid = pid
  self.dict_sha1_infos = dict()
  super(ExtractionChecker, self).__init__(global_conf, prefix)
  self.featurizer_type = self.get_required_param("featurizer_type")
  self.detector_type = self.get_required_param("detector_type")
  self.input_type = self.get_required_param("input_type")
  # Needs to be built from extraction type and detection input + "_processed"
  self.extr_family_column = "ext"
  tmp_extr_family_column = self.get_param("extr_family_column")
  if tmp_extr_family_column:
    self.extr_family_column = tmp_extr_family_column
  # Max delay
  self.max_delay = 3600
  # self.max_delay = 600
  max_delay = self.get_param("max_delay")
  if max_delay:
    self.max_delay = int(max_delay)
  self.last_push = time.time()
  self.nb_imgs_check = 0
  self.nb_imgs_unproc = 0
  self.nb_imgs_unproc_lastprint = 0
  # Beware, self.extr_family_column should be added to the indexer families parameter in get_create_table...
  # TODO: should we add the 'ad' column family too here by default?
  self.tablesha1_col_families = {'info': dict(), self.extr_family_column: dict()}
  self.list_extr_prefix = [self.featurizer_type, "feat", self.detector_type, self.input_type]
  self.extr_prefix = "_".join(self.list_extr_prefix)
  self.extr_prefix_base_column_name = self.extr_family_column + ":" + self.extr_prefix
  self.extr_check_column = self.extr_prefix_base_column_name + "_processed"
  # Needs to be built from extraction type and extraction input + "_batchid"
  self.batch_check_column = self.extr_prefix_base_column_name + "_updateid"
  self.check_columns = [self.extr_check_column, self.batch_check_column]
  self.set_pp()
  # Initialize indexer and ingester
  self.indexer = HBaseIndexerMinimal(self.global_conf,
                                     prefix=self.get_required_param("indexer_prefix"))
  self.ingester = GenericKafkaProcessor(self.global_conf,
                                        prefix=self.get_required_param("check_ingester_prefix"))
  # This will not be set for HBase processing, but the checker would keep dying here...
  try:
    self.updates_out_topic = self.ingester.get_required_param("producer_updates_out_topic")
  except Exception as inst:
    print("Could not initialize checker, sleeping for {}s.".format(self.max_delay))
    time.sleep(self.max_delay)
    raise inst
  self.ingester.pp = "ec"
  if self.pid:
    self.ingester.pp += str(self.pid)
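# For reference, a small self-contained sketch of the check-column names built in
# the constructor above. The featurizer/detector/input values here are
# placeholders for illustration only, not values from any original configuration.
featurizer_type, detector_type, input_type = "sbpycaffe", "full", "image"  # placeholders
extr_prefix = "_".join([featurizer_type, "feat", detector_type, input_type])
base_col = "ext" + ":" + extr_prefix          # -> "ext:sbpycaffe_feat_full_image"
extr_check_column = base_col + "_processed"   # marks an image as processed
batch_check_column = base_col + "_updateid"   # stores the update batch id
print([extr_check_column, batch_check_column])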
class ExtractionProcessor(ConfReader): def __init__(self, global_conf, prefix=default_extr_proc_prefix): self.extractor = None self.nb_empt = 0 self.nb_err = 0 self.max_proc_time = 1200 # in seconds. Increased for sbcmdline... self.url_input = True super(ExtractionProcessor, self).__init__(global_conf, prefix) self.input_type = self.get_required_param("input_type") self.nb_threads = self.get_required_param("nb_threads") self.featurizer_type = self.get_required_param("featurizer_type") self.featurizer_prefix = self.get_required_param("featurizer_prefix") self.detector_type = self.get_required_param("detector_type") # Means we extract feature from the whole image if self.detector_type == "full": self.detector = None self.verbose = 0 verbose = self.get_param("verbose") if verbose: self.verbose = int(verbose) self.ingestion_input = "kafka" ingestion_input = self.get_param("ingestion_input") if ingestion_input: self.ingestion_input = ingestion_input file_input = self.get_param("file_input") print("[{}.ExtractionProcessor: log] file_input: {}".format(self.pp, file_input)) if file_input: self.url_input = False print("[{}.ExtractionProcessor: log] url_input: {}".format(self.pp, self.url_input)) if self.url_input: self.img_column = img_URL_column else: self.img_column = img_path_column print("[{}.ExtractionProcessor: log] img_column: {}".format(self.pp, self.img_column)) # Need to be build from extraction type and detection input + "_processed" self.extr_family_column = "ext" tmp_extr_family_column = self.get_param("extr_family_column") if tmp_extr_family_column: self.extr_family_column = tmp_extr_family_column self.push_back = False push_back = self.get_param("push_back") if push_back: self.push_back = True self.extr_prefix = build_extr_str(self.featurizer_type, self.detector_type, self.input_type) self.set_pp() # Initialize queues self.init_queues() # Initialize extractors only once (just one first) self.extractors = [] #for i in range(self.nb_threads): # self.extractors.append(GenericExtractor(self.detector_type, self.featurizer_type, self.input_type, # self.extr_family_column, self.featurizer_prefix, self.global_conf)) self.extractors.append(GenericExtractor(self.detector_type, self.featurizer_type, self.input_type, self.extr_family_column, self.featurizer_prefix, self.global_conf)) # Beware, the self.extr_family_column should be added to the indexer families parameter in get_create_table... # What if the table has some other column families?... 
self.tablesha1_col_families = {'info': dict(), self.extr_family_column: dict()} # Initialize indexer self.indexer = HBaseIndexerMinimal(self.global_conf, prefix=self.get_required_param("indexer_prefix")) self.last_update_date_id = '' # Initialize ingester self.ingester = GenericKafkaProcessor(self.global_conf, prefix=self.get_required_param("proc_ingester_prefix")) self.ingester.pp = "ep" def set_pp(self): self.pp = "ExtractionProcessor" if self.extractor: self.pp += "_"+self.extr_prefix def init_queues(self): from multiprocessing import JoinableQueue self.q_in = [] self.q_out = [] for i in range(self.nb_threads): self.q_in.append(JoinableQueue(0)) self.q_out.append(JoinableQueue(0)) def get_batch_hbase(self): # legacy implementation: better to have a kafka topic for batches to be processed to allow # safe parallelization on different machines try: for updates in self.indexer.get_unprocessed_updates_from_date(self.last_update_date_id, extr_type=self.extr_prefix): for update_id, update_cols in updates: if self.extr_prefix in update_id: # double check update has not been processed somewhere else if self.is_update_unprocessed(update_id): # double check update was not marked as started recently i.e. by another process if self.is_update_notstarted(update_id, max_delay=TIME_ELAPSED_FAILED): list_sha1s = update_cols[column_list_sha1s].split(',') log_msg = "[{}.get_batch_hbase: log] Update {} has {} images." print(log_msg.format(self.pp, update_id, len(list_sha1s))) # also get 'ext:' to check if extraction was already processed? rows_batch = self.indexer.get_columns_from_sha1_rows(list_sha1s, columns=[img_buffer_column, self.img_column]) # print "rows_batch", rows_batch if rows_batch: yield rows_batch, update_id self.last_update_date_id = '_'.join(update_id.split('_')[-2:]) else: log_msg = "[{}.get_batch_hbase: log] Did not get any image buffer for update: {}" print(log_msg.format(self.pp, update_id)) else: log_msg = "[{}.get_batch_hbase: log] Skipping update started recently: {}" print(log_msg.format(self.pp, update_id)) continue else: log_msg = "[{}.get_batch_hbase: log] Skipping already processed update: {}" print(log_msg.format(self.pp, update_id)) continue else: if self.verbose > 6: log_msg = "[{}.get_batch_hbase: log] Skipping update {} from another extraction type." print(log_msg.format(self.pp, update_id)) else: print("[{}.get_batch_hbase: log] No unprocessed update found.".format(self.pp)) # Look for updates that have some unprocessed images # TODO: wether we do that or not could be specified by a parameter # as this induces slow down during update... for updates in self.indexer.get_missing_extr_updates_from_date("1970-01-01", extr_type=self.extr_prefix): for update_id, update_cols in updates: if self.extr_prefix in update_id: if column_list_sha1s in update_cols: list_sha1s = update_cols[column_list_sha1s].split(',') log_msg = "[{}.get_batch_hbase: log] Update {} has {} images missing extractions." print(log_msg.format(self.pp, update_id, len(list_sha1s))) sys.stdout.flush() # also get 'ext:' to check if extraction was already processed? rows_batch = self.indexer.get_columns_from_sha1_rows(list_sha1s, columns=[img_buffer_column, self.img_column]) if rows_batch: yield rows_batch, update_id else: log_msg = "[{}.get_batch_hbase: log] Did not get any image buffer for update: {}" print(log_msg.format(self.pp, update_id)) else: log_msg = "[{}.get_batch_hbase: log] Update {} has no images list." 
print(log_msg.format(self.pp, update_id)) else: log_msg = "[{}.get_batch_hbase: log] Skipping update {} from another extraction type." print(log_msg.format(self.pp, update_id)) else: log_msg = "[{}.get_batch_hbase: log] No updates with missing extractions found." print(log_msg.format(self.pp)) sys.stdout.flush() except Exception as inst: full_trace_error("[{}.get_batch_hbase: error] {}".format(self.pp, inst)) def is_update_unprocessed(self, update_id): update_rows = self.indexer.get_rows_by_batch([update_id], table_name=self.indexer.table_updateinfos_name) if update_rows: for row in update_rows: if info_column_family+":"+update_str_processed in row[1]: return False return True def is_update_notstarted(self, update_id, max_delay=None): """Check if an update was not started yet. :param update_id: update id :param max_delay: delay (in seconds) between marked start time and now to consider update failed :return: boolean """ update_rows = self.indexer.get_rows_by_batch([update_id], table_name=self.indexer.table_updateinfos_name) if update_rows: for row in update_rows: if info_column_family+":"+update_str_started in row[1]: if max_delay: # check that started was mark recently, if not it may mean the update processing failed start_str = row[1][info_column_family+":"+update_str_started] # start time format is '%Y-%m-%d:%H.%M.%S' start_dt = datetime.strptime(start_str, '%Y-%m-%d:%H.%M.%S') now_dt = datetime.now() diff_dt = now_dt - start_dt if diff_dt.total_seconds() > max_delay: return True else: return False else: return False return True def get_batch_kafka(self): # Read from a kafka topic to allow safer parallelization on different machines try: # Needs to read topic to get update_id and list of sha1s for msg in self.ingester.consumer: msg_dict = json.loads(msg.value) update_id = msg_dict.keys()[0] # NB: Try to get update info and check it was really not processed yet. if self.is_update_unprocessed(update_id): str_list_sha1s = msg_dict[update_id] list_sha1s = str_list_sha1s.split(',') print("[{}.get_batch_kafka: log] Update {} has {} images.".format(self.pp, update_id, len(list_sha1s))) # NB: we could also get 'ext:' of images to double check if extraction was already processed #rows_batch = self.indexer.get_columns_from_sha1_rows(list_sha1s, columns=["info:img_buffer"]) if self.verbose > 3: print("[{}.get_batch_kafka: log] Looking for colums: {}".format(self.pp, [img_buffer_column, self.img_column])) rows_batch = self.indexer.get_columns_from_sha1_rows(list_sha1s, columns=[img_buffer_column, self.img_column]) #print "rows_batch", rows_batch if rows_batch: if self.verbose > 4: print("[{}.get_batch_kafka: log] Yielding for update: {}".format(self.pp, update_id)) yield rows_batch, update_id self.ingester.consumer.commit() if self.verbose > 4: print("[{}.get_batch_kafka: log] After yielding for update: {}".format(self.pp, update_id)) self.last_update_date_id = '_'.join(update_id.split('_')[-2:]) # Should we try to commit offset only at this point? 
else: print("[{}.get_batch_kafka: log] Did not get any image buffers for the update: {}".format(self.pp, update_id)) else: print("[{}.get_batch_kafka: log] Skipping already processed update: {}".format(self.pp, update_id)) else: print("[{}.get_batch_kafka: log] No update found.".format(self.pp)) # Fall back to checking HBase for unstarted/unfinished updates for rows_batch, update_id in self.get_batch_hbase(): yield rows_batch, update_id except Exception as inst: full_trace_error("[{}.get_batch_kafka: error] {}".format(self.pp, inst)) def get_batch(self): if self.ingestion_input == "hbase": for rows_batch, update_id in self.get_batch_hbase(): yield rows_batch, update_id else: for rows_batch, update_id in self.get_batch_kafka(): yield rows_batch, update_id def process_batch(self): # Get a new update batch for rows_batch, update_id in self.get_batch(): try: start_update = time.time() print("[{}] Processing update {} of {} rows.".format(self.pp, update_id, len(rows_batch))) sys.stdout.flush() # Initialize self.nb_empt = 0 self.init_queues() threads = [] # If we deleted an extractor at some point or for first batch nb_extr_to_create = self.nb_threads - len(self.extractors) if nb_extr_to_create: start_create_extractor = time.time() while len(self.extractors) < self.nb_threads: self.extractors.append(GenericExtractor(self.detector_type, self.featurizer_type, self.input_type, self.extr_family_column, self.featurizer_prefix, self.global_conf)) buff_msg = "[{}] Created {} extractors in {}s." create_extr_time = time.time() - start_create_extractor print(buff_msg.format(self.pp, nb_extr_to_create, create_extr_time)) # Mark batch as started to be process now_str = datetime.now().strftime('%Y-%m-%d:%H.%M.%S') dict_val = {info_column_family + ':' + update_str_started: now_str} update_started_dict = {update_id: dict_val} self.indexer.push_dict_rows(dict_rows=update_started_dict, table_name=self.indexer.table_updateinfos_name) # Push images to queue list_in = [] # For parallelized downloading... from Queue import Queue nb_imgs_dl = 0 q_in_dl = Queue(0) q_out_dl = Queue(0) start_get_buffer = time.time() for img in rows_batch: # should decode base64 if img_buffer_column in img[1]: tup = (img[0], img[1][img_buffer_column], False) list_in.append(tup) else: # need to re-download, accumulate a list of URLs to download # Deal with img_path_column for local_images_kafka_pusher if self.img_column in img[1]: q_in_dl.put((img[0], img[1][self.img_column], self.push_back)) nb_imgs_dl += 1 else: print("[{}: warning] No buffer and no URL/path for image {} !".format(self.pp, img[0])) continue # Download missing images if nb_imgs_dl > 0: threads_dl = [] for i in range(min(self.nb_threads, nb_imgs_dl)): # should read (url, obj_pos) from self.q_in # and push (url, obj_pos, buffer, img_info, start_process, end_process) to self.q_out thread = ThreadedDownloaderBufferOnly(q_in_dl, q_out_dl, url_input=self.url_input) thread.start() threads_dl.append(thread) q_in_dl.join() # Push downloaded images to list_in too nb_dl = 0 while nb_dl < nb_imgs_dl: # This can block? 
#sha1, buffer, push_back, inst = q_out_dl.get() try: sha1, buffer, push_back, inst = q_out_dl.get(True, 10) except Exception as queue_err: warn_msg = "[{}: error] Download queue out timed out: {}" print(warn_msg.format(self.pp, queue_err)) break nb_dl += 1 if inst: if self.verbose > 0: log_msg = "[{}: log] Could not download image {}, error was: {}" print(log_msg.format(self.pp, sha1, inst)) else: if buffer: list_in.append((sha1, buffer, push_back)) else: # Is that possible? err_msg = "[{}: error] No error but no buffer either for image {}" print(err_msg.format(self.pp, sha1)) get_buffer_time = time.time() - start_get_buffer buff_msg = "[{}] Got {}/{} image buffers for update {} in {}s." print(buff_msg.format(self.pp, len(list_in), len(rows_batch), update_id, get_buffer_time)) sys.stdout.flush() q_batch_size = int(math.ceil(float(len(list_in))/self.nb_threads)) for i, q_batch in enumerate(build_batch(list_in, q_batch_size)): self.q_in[i].put(q_batch) q_in_size = [] q_in_size_tot = 0 for i in range(self.nb_threads): q_in_size.append(self.q_in[i].qsize()) q_in_size_tot += q_in_size[i] if self.verbose > 5: print("[{}] Total input queues sizes is: {}".format(self.pp, q_in_size_tot)) # Start daemons... thread_creation_failed = [0]*self.nb_threads for i in range(self.nb_threads): # one per non empty input queue if q_in_size[i] > 0: try: thread = DaemonBatchExtractor(self.extractors[i], self.q_in[i], self.q_out[i], verbose=self.verbose) # Could get a 'Cannot allocate memory' if we are using too many threads... thread.start() threads.append(thread) except OSError as inst: # Should we try to push self.q_in[i] data to some other thread? print("[{}.process_batch: error] Could not start thread #{}: {}".format(self.pp, i+1, inst)) thread_creation_failed[i] = 1 time.sleep(10*sum(thread_creation_failed)) if sum(thread_creation_failed) == self.nb_threads: raise ValueError("Could not start any thread...") nb_threads_running = len(threads) start_process = time.time() stop = time.time() + self.max_proc_time # Wait for all tasks to be marked as done threads_finished = [0] * nb_threads_running deleted_extr = [0] * nb_threads_running thread_msg = "[{}] Thread {}/{} (pid: {}) " while sum(threads_finished) < nb_threads_running: for i in range(nb_threads_running): if sum(threads_finished) == nb_threads_running: sys.stdout.flush() break if threads_finished[i] == 1: continue i_q_in = i + sum(thread_creation_failed[:i + 1]) if q_in_size[i_q_in] > 0: # This seems to block forever sometimes, if subprocess crashed?... #self.q_in[i].join() # Manual join with timeout... # https://github.com/python/cpython/blob/3.6/Lib/multiprocessing/queues.py if not self.q_in[i_q_in]._unfinished_tasks._semlock._is_zero() and time.time() < stop: time.sleep(1) else: if self.q_in[i_q_in]._unfinished_tasks._semlock._is_zero(): if self.verbose > 5: msg = thread_msg+"marked as finished because processing seems finished" print(msg.format(self.pp, i+1, nb_threads_running, threads[i].pid)) else: if self.verbose > 0: # In this cases does this happen... msg = thread_msg+"force marked task as done as max_proc_time ({}) has passed." print(msg.format(self.pp, i+1, nb_threads_running, threads[i].pid, self.max_proc_time)) sys.stdout.flush() # Try to delete corresponding extractor to free memory? 
# And reduce number of threads at the end of the loop try: self.q_in[i_q_in].task_done() if deleted_extr[i] == 0: # we pushed the extractor as self.extractors[i] in a loop of self.nb_threads # we use i_q_in del self.extractors[i_q_in] deleted_extr[i] = 1 except Exception: pass threads_finished[i] = 1 else: if self.verbose > 2: # We actually never gave something to process... msg = thread_msg+"marked as finished because no data was passed to it" print(msg.format(self.pp, i+1, nb_threads_running, threads[i].pid)) threads_finished[i] = 1 # Cleanup threads to free memory before getting data back # Daemon may still be running... # and will actually be deleted only when they exit after not getting a batch del threads # Gather results q_out_size = [] q_out_size_tot = 0 for i in range(self.nb_threads): q_out_size.append(self.q_out[i].qsize()) q_out_size_tot += q_out_size[i] if self.verbose > 5: print("[{}: log] Total output queues size is: {}".format(self.pp, q_out_size_tot)) sys.stdout.flush() # Can get stuck here? dict_imgs = dict() for i in range(self.nb_threads): if self.verbose > 4: print("[{}] Thread {} q_out_size: {}".format(self.pp, i+1, q_out_size[i])) sys.stdout.flush() while q_out_size[i]>0 and not self.q_out[i].empty(): if self.verbose > 6: print("[{}] Thread {} q_out is not empty.".format(self.pp, i + 1)) sys.stdout.flush() try: batch_out = self.q_out[i].get(True, 10) if self.verbose > 4: msg = "[{}] Got batch of {} features from thread {} q_out." print(msg.format(self.pp, len(batch_out), i + 1)) sys.stdout.flush() for sha1, dict_out in batch_out: dict_imgs[sha1] = dict_out except: if self.verbose > 1: print("[{}] Thread {} failed to get from q_out: {}".format(self.pp, i+1)) sys.stdout.flush() pass if self.verbose > 4: print("[{}] Marking task done in q_out of thread {}.".format(self.pp, i + 1)) sys.stdout.flush() self.q_out[i].task_done() #if self.verbose > 0: print_msg = "[{}] Got features for {}/{} images in {}s." proc_time = time.time() - start_process print(print_msg.format(self.pp, len(dict_imgs.keys()), len(list_in), proc_time)) sys.stdout.flush() # Push them self.indexer.push_dict_rows(dict_rows=dict_imgs, table_name=self.indexer.table_sha1infos_name) # Mark batch as processed update_processed_dict = {update_id: {info_column_family + ':' + update_str_processed: datetime.now().strftime('%Y-%m-%d:%H.%M.%S')}} self.indexer.push_dict_rows(dict_rows=update_processed_dict, table_name=self.indexer.table_updateinfos_name) # Mark as completed if all rows had an extraction if len(rows_batch) == len(dict_imgs.keys()): update_completed_dict = {update_id: {info_column_family + ':' + update_str_completed: str(1)}} self.indexer.push_dict_rows(dict_rows=update_completed_dict, table_name=self.indexer.table_updateinfos_name) # Cleanup del self.q_in del self.q_out # To try to adjust a too optimistic nb_threads setting # if (sum(thread_creation_failed) > 0 or sum(deleted_extr) > 0) and self.nb_threads > 2: # self.nb_threads -= 1 print_msg = "[{}] Completed update {} in {}s." print(print_msg.format(self.pp, update_id, time.time() - start_update)) sys.stdout.flush() self.nb_err = 0 # Force garbage collection? gc.collect() # Should we just raise an Exception and restart clean? if sum(thread_creation_failed) > 0 or sum(deleted_extr) > 0: raise ValueError("Something went wrong. 
Trying to restart clean") except Exception as inst: exc_type, exc_obj, exc_tb = sys.exc_info() fulltb = traceback.format_tb(exc_tb) raise type(inst)(" {} ({})".format(inst, ''.join(fulltb))) def run(self): self.nb_empt = 0 self.nb_err = 0 while True: self.process_batch() print("[ExtractionProcessor: log] Nothing to process at: {}".format(datetime.now().strftime('%Y-%m-%d:%H.%M.%S'))) sys.stdout.flush() time.sleep(10*self.nb_empt) self.nb_empt += 1
"/home/ubuntu/caffe_cpu/build/tools/extract_nfeatures", "HBI_host": "10.1.94.57", "HBI_table_sha1infos": "escorts_images_sha1_infos_from_ts" } pyconf = { "SBPYCAFFEIMGFEAT_sbcaffe_path": "./data/caffe_sentibank_train_iter_250000", "SBPYCAFFEIMGFEAT_imgmean_path": "./data/imagenet_mean.npy", } diffs = [] rows = [] if list_sha1s[0]: hbi = HBaseIndexerMinimal(conf, prefix="HBI_") rows = hbi.get_columns_from_sha1_rows( list_sha1s, columns=["info:featnorm_cu", "info:s3_url"]) sbclif = SentiBankCmdLineImgFeaturizer(conf) sbpcif = SentiBankPyCaffeImgFeaturizer(pyconf) for row in rows: feat_hbase_b64 = featB64decode(row[1]["info:featnorm_cu"]) #print feat_hbase_b64.shape img_url = row[1]["info:s3_url"] start_extr = time.time() img_buffer = get_buffer_from_URL(img_url) feat, data = sbclif.featurize(img_buffer, sha1=row[0]) img_buffer.seek(0) pydata = sbpcif.preprocess_img(img_buffer) fpydata = pydata.flatten()
def __init__(self, global_conf, prefix=DEFAULT_EXTR_PROC_PREFIX): """ExtractionProcessor constructor :param global_conf_in: configuration file or dictionary :type global_conf_in: str, dict :param prefix: prefix in configuration :type prefix: str """ self.extractor = None self.nb_empt = 0 self.nb_err = 0 self.max_proc_time = 1200 # in seconds. Increased for sbcmdline... self.url_input = True super(ExtractionProcessor, self).__init__(global_conf, prefix) # TODO: move that to self.read_conf() # Get required parameters self.input_type = self.get_required_param("input_type") self.nb_threads = self.get_required_param("nb_threads") self.featurizer_type = self.get_required_param("featurizer_type") self.featurizer_prefix = self.get_required_param("featurizer_prefix") self.detector_type = self.get_required_param("detector_type") # Get optional parameters self.verbose = int(self.get_param("verbose", default=0)) self.ingestion_input = self.get_param("ingestion_input", default="kafka") self.push_back = self.get_param("push_back", default=False) file_input = self.get_param("file_input") print("[{}.ExtractionProcessor: log] file_input: {}".format( self.pp, file_input)) if file_input: self.url_input = False print("[{}.ExtractionProcessor: log] url_input: {}".format( self.pp, self.url_input)) # Means we extract feature from the whole image if self.detector_type == "full": self.detector = None self.extr_prefix = build_extr_str(self.featurizer_type, self.detector_type, self.input_type) self.set_pp() # Initialize queues self.init_queues() # Initialize indexer # We now have two indexers: # - one "in_indexer" for TF table with buffer, img URLs etc... # - one "out_indexer" for our table with extractions etc # NB: they could be the same if tables are merged... self.out_indexer = HBaseIndexerMinimal( self.global_conf, prefix=self.get_required_param("indexer_prefix")) prefix_in_indexer = self.get_param("in_indexer_prefix", default=False) if prefix_in_indexer: self.in_indexer = HBaseIndexerMinimal(self.global_conf, prefix=prefix_in_indexer) insha1tab = self.in_indexer.table_sha1infos_name insha1cfs = self.in_indexer.get_dictcf_sha1_table() print("[{}] 'in_indexer' sha1 table {} columns are: {}".format( self.pp, insha1tab, insha1cfs)) else: print( "[{}] empty 'in_indexer_prefix', using out_indexer as in_indexer too." .format(self.pp)) self.in_indexer = self.out_indexer # Initialize extractors only once (just one first) self.extractors = [] # DONE: use 'out_indexer' self.extractors.append( GenericExtractor(self.detector_type, self.featurizer_type, self.input_type, self.out_indexer.extrcf, self.featurizer_prefix, self.global_conf)) # DONE: use 'in_indexer' if self.url_input: self.img_column = self.in_indexer.get_col_imgurl() else: self.img_column = self.in_indexer.get_col_imgpath() img_cols = [ self.in_indexer.get_col_imgbuff(), self.in_indexer.get_col_imgurlbak(), self.img_column ] print("[{}.ExtractionProcessor: log] img_cols: {}".format( self.pp, img_cols)) self.last_update_date_id = '' # Initialize ingester self.ingester = GenericKafkaProcessor( self.global_conf, prefix=self.get_required_param("proc_ingester_prefix")) self.ingester.pp = "ep"
class ExtractionProcessor(ConfReader): """ExtractionProcessor class """ def __init__(self, global_conf, prefix=DEFAULT_EXTR_PROC_PREFIX): """ExtractionProcessor constructor :param global_conf_in: configuration file or dictionary :type global_conf_in: str, dict :param prefix: prefix in configuration :type prefix: str """ self.extractor = None self.nb_empt = 0 self.nb_err = 0 self.max_proc_time = 900 # in seconds. Increased for sbcmdline... self.url_input = True super(ExtractionProcessor, self).__init__(global_conf, prefix) # TODO: move that to self.read_conf() # Get required parameters self.input_type = self.get_required_param("input_type") self.nb_threads = self.get_required_param("nb_threads") self.featurizer_type = self.get_required_param("featurizer_type") self.featurizer_prefix = self.get_required_param("featurizer_prefix") self.detector_type = self.get_required_param("detector_type") # Get optional parameters self.verbose = int(self.get_param("verbose", default=0)) self.maxucme = int( self.get_param("max_up_check_miss_extr", default=MAX_UP_CHECK_MISS_EXTR)) self.ingestion_input = self.get_param("ingestion_input", default="kafka") self.push_back = self.get_param("push_back", default=False) file_input = self.get_param("file_input") print("[{}.ExtractionProcessor: log] file_input: {}".format( self.pp, file_input)) if file_input: self.url_input = False print("[{}.ExtractionProcessor: log] url_input: {}".format( self.pp, self.url_input)) # Means we extract feature from the whole image if self.detector_type == "full": self.detector = None self.extr_prefix = build_extr_str(self.featurizer_type, self.detector_type, self.input_type) self.set_pp() # Initialize queues self.init_queues() # Initialize indexer # We now have two indexers: # - one "in_indexer" for TF table with buffer, img URLs etc... # - one "out_indexer" for our table with extractions etc # NB: they could be the same if tables are merged... self.out_indexer = HBaseIndexerMinimal( self.global_conf, prefix=self.get_required_param("indexer_prefix")) self.out_indexer.pp = "ProcOutHBase" prefix_in_indexer = self.get_param("in_indexer_prefix", default=False) if prefix_in_indexer: self.in_indexer = HBaseIndexerMinimal(self.global_conf, prefix=prefix_in_indexer) self.in_indexer.pp = "ProcInHBase" insha1tab = self.in_indexer.table_sha1infos_name insha1cfs = self.in_indexer.get_dictcf_sha1_table() print("[{}] 'in_indexer' sha1 table {} columns are: {}".format( self.pp, insha1tab, insha1cfs)) else: print( "[{}] empty 'in_indexer_prefix', using out_indexer as in_indexer too." 
.format(self.pp)) self.in_indexer = self.out_indexer self.in_indexer.pp = "ProcInOutHBase" # Initialize extractors only once (just one first) self.extractors = [] # DONE: use 'out_indexer' self.extractors.append( GenericExtractor(self.detector_type, self.featurizer_type, self.input_type, self.out_indexer.extrcf, self.featurizer_prefix, self.global_conf)) # DONE: use 'in_indexer' if self.url_input: self.img_column = self.in_indexer.get_col_imgurl() else: self.img_column = self.in_indexer.get_col_imgpath() img_cols = [ self.in_indexer.get_col_imgbuff(), self.in_indexer.get_col_imgurlbak(), self.img_column ] print("[{}.ExtractionProcessor: log] img_cols: {}".format( self.pp, img_cols)) self.last_update_date_id = "1970-01-01" self.last_missing_extr_date = "1970-01-01" # Initialize ingester self.ingester = GenericKafkaProcessor( self.global_conf, prefix=self.get_required_param("proc_ingester_prefix")) self.ingester.pp = "ep" def set_pp(self, pp="ExtractionProcessor"): """Set pretty name :param pp: pretty name prefix :type pp: str """ self.pp = pp if self.extractor: self.pp += "_" + self.extr_prefix def init_queues(self): """Initialize queues list ``self.q_in`` and ``self.q_out`` """ from multiprocessing import JoinableQueue self.q_in = [] self.q_out = [] for _ in range(self.nb_threads): self.q_in.append(JoinableQueue(0)) self.q_out.append(JoinableQueue(0)) # Should these two methods be in indexer? def is_update_unprocessed(self, update_id): """Check if an update was not processed yet :param update_id: update id :type update_id: str :return: boolean indicated if update ``update_id`` is unprocessed :rtype: bool """ # DONE: use out_indexer update_rows = self.out_indexer.get_rows_by_batch( [update_id], table_name=self.out_indexer.table_updateinfos_name) if update_rows: for row in update_rows: if self.out_indexer.get_col_upproc() in row[1]: return False return True def is_update_notstarted(self, update_id, max_delay=None): """Check if an update was not started yet :param update_id: update id :type update_id: str :param max_delay: delay (in seconds) between marked start time and now to consider update failed :type max_delay: int :return: boolean :rtype: bool """ # DONE: use out_indexer update_rows = self.out_indexer.get_rows_by_batch( [update_id], table_name=self.out_indexer.table_updateinfos_name) if update_rows: for row in update_rows: # changed to: self.column_update_started #if info_column_family+":"+update_str_started in row[1]: #if self.column_update_started in row[1]: # DONE: use out_indexer if self.out_indexer.get_col_upstart() in row[1]: if max_delay: start_str = row[1][self.out_indexer.get_col_upstart()] # start time format is '%Y-%m-%d:%H.%M.%S' start_dt = datetime.strptime(start_str, '%Y-%m-%d:%H.%M.%S') now_dt = datetime.now() diff_dt = now_dt - start_dt if diff_dt.total_seconds() > max_delay: return True return False return True def get_batch_hbase(self): """Get one batch of images from HBase :yield: tuple (rows_batch, update_id) """ # legacy implementation: better to have a kafka topic for batches to be processed to allow # safe and efficient parallelization on different machines # DONE: use in_indexer img_cols = [ self.in_indexer.get_col_imgbuff(), self.in_indexer.get_col_imgurlbak(), self.img_column ] try: # DONE: use out_indexer for updates in self.out_indexer.get_unprocessed_updates_from_date( self.last_update_date_id, extr_type=self.extr_prefix): for update_id, update_cols in updates: if self.extr_prefix in update_id: # double check update has not been processed somewhere else if 
self.is_update_unprocessed(update_id): # double check update was not marked as started recently i.e. by another process if self.is_update_notstarted( update_id, max_delay=TIME_ELAPSED_FAILED): # DONE: use out_indexer list_sha1s = update_cols[ self.out_indexer.get_col_listsha1s( )].split(',') msg = "[{}.get_batch_hbase: log] Update {} has {} images." print( msg.format(self.pp, update_id, len(list_sha1s))) # We should time that, it seems slow i.e. 2/3 minutes per update. try: rows_batch = self.in_indexer.get_columns_from_sha1_rows( list_sha1s, rbs=BATCH_SIZE_IMGBUFFER, columns=img_cols) except Exception: msg = "[{}.get_batch_hbase: warning] Failed retrieving images data for update: {}" print(msg.format(self.pp, update_id)) # flush? sys.stdout.flush() # Update self.last_update_date_id ? #self.last_update_date_id = '_'.join(update_id.split('_')[-2:]) continue # print "rows_batch", rows_batch if rows_batch: yield rows_batch, update_id else: msg = "[{}.get_batch_hbase: log] Did not get any image buffer for update: {}" print(msg.format(self.pp, update_id)) #msg = "[{}.get_batch_hbase: log] Was trying to read columns {} from table {} for rows {}" #print(msg.format(self.pp, img_cols, self.in_indexer.table_sha1infos_name, list_sha1s)) # Store last update id self.last_update_date_id = '_'.join( update_id.split('_')[-2:]) else: msg = "[{}.get_batch_hbase: log] Skipping update started recently: {}" print(msg.format(self.pp, update_id)) continue else: msg = "[{}.get_batch_hbase: log] Skipping already processed update: {}" print(msg.format(self.pp, update_id)) continue else: if self.verbose > 6: msg = "[{}.get_batch_hbase: log] Skipping update {} from another extraction type." print(msg.format(self.pp, update_id)) else: print("[{}.get_batch_hbase: log] No unprocessed update found.". format(self.pp)) # Should we reinitialized self.last_update_date_id? # Look for updates that have some unprocessed images # TODO: wether we do that or not could be specified by a parameter # as this induces slow down during update... # DONE: use out_indexer count_ucme = 0 stop_cme = False for updates in self.out_indexer.get_missing_extr_updates_from_date( self.last_missing_extr_date, extr_type=self.extr_prefix): for update_id, update_cols in updates: if self.extr_prefix in update_id: # DONE: use out_indexer if self.out_indexer.get_col_listsha1s( ) in update_cols: list_sha1s = update_cols[ self.out_indexer.get_col_listsha1s( )].split(',') msg = "[{}.get_batch_hbase: log] Update {} has {} images missing extractions." print( msg.format(self.pp, update_id, len(list_sha1s))) sys.stdout.flush() # also get 'ext:' to check if extraction was already processed? # DONE: use in_indexer rows_batch = self.in_indexer.get_columns_from_sha1_rows( list_sha1s, rbs=BATCH_SIZE_IMGBUFFER, columns=img_cols) if rows_batch: yield rows_batch, update_id self.last_missing_extr_date = '_'.join( update_id.split('_')[-2:]) count_ucme += 1 if count_ucme >= self.maxucme: stop_cme = True break else: msg = "[{}.get_batch_hbase: log] Did not get any image buffer for update: {}" print(msg.format(self.pp, update_id)) else: msg = "[{}.get_batch_hbase: log] Update {} has no images list." print(msg.format(self.pp, update_id)) else: msg = "[{}.get_batch_hbase: log] Skipping update {} from another extraction type." 
print(msg.format(self.pp, update_id)) # We have reached maximum number of check for missing extractions in one call if stop_cme: break else: if stop_cme: msg = "[{}.get_batch_hbase: log] Stopped checking updates with missing extractions" msg += "after founding {}/{}." print( msg.format(self.pp, count_ucme, self.maxucme, self.last_missing_extr_date)) msg = "[{}.get_batch_hbase: log] Will restart next time from: {}" print(msg.format(self.pp, self.last_missing_extr_date)) sys.stdout.flush() else: msg = "[{}.get_batch_hbase: log] No updates with missing extractions found." print(msg.format(self.pp)) sys.stdout.flush() # Re-initialize dates just to make sure we don't miss anything self.last_update_date_id = "1970-01-01" self.last_missing_extr_date = "1970-01-01" except Exception as inst: # If we reach this point it is really a succession of failures full_trace_error("[{}.get_batch_hbase: error] {}".format( self.pp, inst)) # Raise Exception to restart process or docker raise inst def get_batch_kafka(self): """Get one batch of images from Kafka :yield: tuple (rows_batch, update_id) """ # Read from a kafka topic to allow safer parallelization on different machines # DONE: use in_indexer img_cols = [ self.in_indexer.get_col_imgbuff(), self.in_indexer.get_col_imgurlbak(), self.img_column ] try: # Needs to read topic to get update_id and list of sha1s if self.ingester.consumer: for msg in self.ingester.consumer: msg_dict = json.loads(msg.value) update_id = msg_dict.keys()[0] # NB: Try to get update info and check it was really not processed yet. if self.is_update_unprocessed(update_id): str_list_sha1s = msg_dict[update_id] list_sha1s = str_list_sha1s.split(',') msg = "[{}.get_batch_kafka: log] Update {} has {} images." print(msg.format(self.pp, update_id, len(list_sha1s))) if self.verbose > 3: msg = "[{}.get_batch_kafka: log] Looking for columns: {}" print(msg.format(self.pp, img_cols)) # DONE: use in_indexer #rows_batch = self.in_indexer.get_columns_from_sha1_rows(list_sha1s, columns=img_cols) rows_batch = self.in_indexer.get_columns_from_sha1_rows( list_sha1s, rbs=BATCH_SIZE_IMGBUFFER, columns=img_cols) #print "rows_batch", rows_batch if rows_batch: if self.verbose > 4: msg = "[{}.get_batch_kafka: log] Yielding for update: {}" print(msg.format(self.pp, update_id)) yield rows_batch, update_id self.ingester.consumer.commit() if self.verbose > 4: msg = "[{}.get_batch_kafka: log] After yielding for update: {}" print(msg.format(self.pp, update_id)) self.last_update_date_id = '_'.join( update_id.split('_')[-2:]) # Should we try to commit offset only at this point? 
else: msg = "[{}.get_batch_kafka: log] Did not get any image buffers for the update: {}" print(msg.format(self.pp, update_id)) else: msg = "[{}.get_batch_kafka: log] Skipping already processed update: {}" print(msg.format(self.pp, update_id)) else: print("[{}.get_batch_kafka: log] No update found.".format( self.pp)) # Fall back to checking HBase for unstarted/unfinished updates for rows_batch, update_id in self.get_batch_hbase(): yield rows_batch, update_id else: print("[{}.get_batch_kafka: log] No consumer found.".format( self.pp)) # Fall back to checking HBase for unstarted/unfinished updates for rows_batch, update_id in self.get_batch_hbase(): yield rows_batch, update_id except Exception as inst: # If we reach this point it is really a succession of failures full_trace_error("[{}.get_batch_kafka: error] {}".format( self.pp, inst)) # Raise Exception to restart process or docker raise inst def get_batch(self): """Get one batch of images :yield: tuple (rows_batch, update_id) """ if self.ingestion_input == "hbase": for rows_batch, update_id in self.get_batch_hbase(): yield rows_batch, update_id else: for rows_batch, update_id in self.get_batch_kafka(): yield rows_batch, update_id def process_batch(self): """Process one batch of images :raises Exception: if something goes really wrong """ # Get a new update batch try: for rows_batch, update_id in self.get_batch(): start_update = time.time() img_list_size = len(rows_batch) print("[{}] Processing update {} of {} rows.".format( self.pp, update_id, img_list_size)) sys.stdout.flush() # Initialize self.nb_empt = 0 self.init_queues() threads = [] # If we have deleted an extractor at some point or for first batch nb_extr_to_create = self.nb_threads - len(self.extractors) if nb_extr_to_create: start_create_extractor = time.time() while len(self.extractors) < min(self.nb_threads, img_list_size): # DONE: use 'out_indexer' self.extractors.append( GenericExtractor(self.detector_type, self.featurizer_type, self.input_type, self.out_indexer.extrcf, self.featurizer_prefix, self.global_conf)) msg = "[{}] Created {} extractors in {}s." create_extr_time = time.time() - start_create_extractor print( msg.format(self.pp, len(self.extractors), create_extr_time)) # Mark batch as started to be process now_str = datetime.now().strftime('%Y-%m-%d:%H.%M.%S') # changed to: self.column_update_started #dict_val = {info_column_family + ':' + update_str_started: now_str} #dict_val = {self.column_update_started: now_str} # DONE: use out_indexer dict_val = {self.out_indexer.get_col_upstart(): now_str} update_started_dict = {update_id: dict_val} # DONE: use out_indexer self.out_indexer.push_dict_rows( dict_rows=update_started_dict, table_name=self.out_indexer.table_updateinfos_name) # TODO: define a get_buffer_images method # -------- # Push images to queue list_in = [] # For parallelized downloading... from Queue import Queue nb_imgs_dl = 0 q_in_dl = Queue(0) q_out_dl = Queue(0) start_get_buffer = time.time() # DONE: use in_indexer in all this scope # How could we transfer URL from in table to out table if they are different?... for img in rows_batch: # should decode base64 #if img_buffer_column in img[1]: if self.in_indexer.get_col_imgbuff() in img[1]: # That's messy... 
# b64buffer = buffer_to_B64(cStringIO.StringIO(img[1][self.in_indexer.get_col_imgbuff()])) # use img[1].pop(self.in_indexer.get_col_imgbuff()) b64buffer = buffer_to_B64( cStringIO.StringIO(img[1].pop( self.in_indexer.get_col_imgbuff()))) tup = (img[0], b64buffer, False) list_in.append(tup) else: # need to re-download, accumulate a list of URLs to download if self.verbose > 5: msg = "[{}: log] Will try to download image {} without buffer" print(msg.format(self.pp, img[0])) # Deal with img_path_column for local_images_kafka_pusher if self.img_column in img[1]: q_in_dl.put((img[0], img[1][self.img_column], self.push_back)) nb_imgs_dl += 1 elif self.in_indexer.get_col_imgurlbak() in img[1]: q_in_dl.put( (img[0], img[1][self.in_indexer.get_col_imgurlbak()], self.push_back)) nb_imgs_dl += 1 else: msg = "[{}: warning] No buffer and no URL/path for image {} !" print(msg.format(self.pp, img[0])) continue # At this point we can delete rows_batch del rows_batch gc.collect() # Download missing images nb_dl = 0 nb_dl_failed = 0 if nb_imgs_dl > 0: threads_dl = [] for i in range(min(self.nb_threads, nb_imgs_dl)): # should read (url, obj_pos) from self.q_in # and push (url, obj_pos, buffer, img_info, start_process, end_process) to self.q_out thread = ThreadedDownloaderBufferOnly( q_in_dl, q_out_dl, url_input=self.url_input) thread.start() threads_dl.append(thread) q_in_dl.join() # Push downloaded images to list_in too while nb_dl < nb_imgs_dl: # This can block? #sha1, buffer, push_back, inst = q_out_dl.get() try: sha1, buffer, push_back, inst = q_out_dl.get( True, 10) except Exception as queue_err: msg = "[{}: error] Download queue out timed out: {}" print(msg.format(self.pp, queue_err)) break nb_dl += 1 if inst: if self.verbose > 6: msg = "[{}: log] Could not download image {}, error was: {}" print(msg.format(self.pp, sha1, inst)) nb_dl_failed += 1 else: if buffer: list_in.append((sha1, buffer, push_back)) else: # Is that even possible? msg = "[{}: error] No error but no buffer either for image {}" print(msg.format(self.pp, sha1)) get_buffer_time = time.time() - start_get_buffer buff_list_size = len(list_in) msg = "[{}] Got {}/{} image buffers ({}/{} downloaded) for update {} in {}s." print( msg.format(self.pp, buff_list_size, img_list_size, nb_dl - nb_dl_failed, nb_dl, update_id, get_buffer_time)) sys.stdout.flush() # -------- # if buff_list_size == 0, we shouldn't try to process anything, just mark update as processed if buff_list_size != 0: # TODO: define a get_features method # -------- q_batch_size = int( math.ceil(float(buff_list_size) / self.nb_threads)) for i, q_batch in enumerate( build_batch(list_in, q_batch_size)): self.q_in[i].put(q_batch) # At this point we can delete list_in del list_in gc.collect() q_in_size = [] q_in_size_tot = 0 for i in range(self.nb_threads): q_in_size.append(self.q_in[i].qsize()) q_in_size_tot += q_in_size[i] if self.verbose > 5: print("[{}] Total input queues sizes is: {}".format( self.pp, q_in_size_tot)) # Start daemons... thread_creation_failed = [0] * self.nb_threads for i in range(self.nb_threads): # one per non empty input queue if q_in_size[i] > 0: try: thread = DaemonBatchExtractor( self.extractors[i], self.q_in[i], self.q_out[i], verbose=self.verbose) # Could get a 'Cannot allocate memory' if we are using too many threads... thread.start() threads.append(thread) except OSError as inst: # Should we try to push self.q_in[i] data to some other thread? 
msg = "[{}.process_batch: error] Could not start thread #{}: {}" print(msg.format(self.pp, i + 1, inst)) thread_creation_failed[i] = 1 time.sleep(sum(thread_creation_failed)) if sum(thread_creation_failed) == self.nb_threads: # We are in trouble... raise RuntimeError("Could not start any thread...") nb_threads_running = len(threads) deleted_extr = [0] * nb_threads_running start_process = time.time() stop = time.time() + self.max_proc_time # Wait for all tasks to be marked as done threads_finished = [0] * nb_threads_running thread_msg = "[{}] Thread {}/{} (pid: {}) " while sum(threads_finished) < nb_threads_running: for i in range(nb_threads_running): if sum(threads_finished) == nb_threads_running: sys.stdout.flush() break if threads_finished[i] == 1: continue i_q_in = i + sum(thread_creation_failed[:i + 1]) if q_in_size[i_q_in] > 0: # This seems to block forever sometimes, if subprocess crashed?... #self.q_in[i].join() # Manual join with timeout... # https://github.com/python/cpython/blob/3.6/Lib/multiprocessing/queues.py if not self.q_in[ i_q_in]._unfinished_tasks._semlock._is_zero( ) and time.time() < stop: time.sleep(1) else: if self.q_in[ i_q_in]._unfinished_tasks._semlock._is_zero( ): if self.verbose > 5: msg = thread_msg + "marked as finished because processing seems finished" print( msg.format( self.pp, i + 1, nb_threads_running, threads[i].pid)) else: # Try to stop processing threads[i].killed = True if self.verbose > 0: # In this cases does this happen... msg = thread_msg + "killed as max_proc_time ({}) has passed." print( msg.format( self.pp, i + 1, nb_threads_running, threads[i].pid, self.max_proc_time)) sys.stdout.flush() # Try to delete corresponding extractor to free memory? # And reduce number of threads at the end of the loop try: # This can block? #self.q_in[i_q_in].task_done() if deleted_extr[i] == 0: # we pushed the extractor as self.extractors[i] in a loop of self.nb_threads # we use i_q_in #del self.extractors[i_q_in] deleted_extr[i] = 1 del self.extractors[ i_q_in - sum(deleted_extr[:i + 1])] #del threads[i - sum(deleted_extr[:i+1])] except Exception: pass threads_finished[i] = 1 else: if self.verbose > 2: # We actually never gave something to process... msg = thread_msg + "marked as finished because no data was passed to it" print( msg.format(self.pp, i + 1, nb_threads_running, threads[i].pid)) threads_finished[i] = 1 # Cleanup threads to free memory before getting data back # Daemon may still be running... # and will actually be deleted only when they exit after not getting a batch del threads # Gather results q_out_size = [] q_out_size_tot = 0 for i in range(self.nb_threads): q_out_size.append(self.q_out[i].qsize()) q_out_size_tot += q_out_size[i] if self.verbose > 5: print( "[{}: log] Total output queues size is: {}".format( self.pp, q_out_size_tot)) sys.stdout.flush() # Can get stuck here? dict_imgs = dict() if q_out_size_tot > 0: for i in range(self.nb_threads): if self.verbose > 4: print("[{}] Thread {} q_out_size: {}".format( self.pp, i + 1, q_out_size[i])) sys.stdout.flush() while q_out_size[i] > 0 and not self.q_out[ i].empty(): if self.verbose > 6: print("[{}] Thread {} q_out is not empty.". format(self.pp, i + 1)) sys.stdout.flush() try: # This can still block forever? #batch_out = self.q_out[i].get(True, 10) batch_out = self.q_out[i].get_nowait() if self.verbose > 4: msg = "[{}] Got batch of {} features from thread {} q_out." 
print( msg.format(self.pp, len(batch_out), i + 1)) sys.stdout.flush() for sha1, dict_out in batch_out: dict_imgs[sha1] = dict_out except: if self.verbose > 1: print( "[{}] Thread {} failed to get from q_out: {}" .format(self.pp, i + 1)) sys.stdout.flush() #pass if self.verbose > 4: print( "[{}] Marking task done in q_out of thread {}." .format(self.pp, i + 1)) sys.stdout.flush() self.q_out[i].task_done() #if self.verbose > 0: print_msg = "[{}] Got features for {}/{} images in {}s." proc_time = time.time() - start_process print( print_msg.format(self.pp, len(dict_imgs.keys()), buff_list_size, proc_time)) sys.stdout.flush() # -------- if len(dict_imgs.keys()) > 0: # Push computed features self.out_indexer.push_dict_rows( dict_rows=dict_imgs, table_name=self.out_indexer.table_sha1infos_name) else: msg = "[{}: Warning] Could not get any image buffer (out of {} requested) for update {}" print(msg.format(self.pp, img_list_size, update_id)) dict_imgs = dict() nb_threads_running = len(threads) thread_creation_failed = [0] * self.nb_threads deleted_extr = [0] * nb_threads_running # Mark batch as processed now_str = datetime.now().strftime('%Y-%m-%d:%H.%M.%S') # DONE: use out_indexer update_processed_dict = { update_id: { self.out_indexer.get_col_upproc(): now_str } } self.out_indexer.push_dict_rows( dict_rows=update_processed_dict, table_name=self.out_indexer.table_updateinfos_name) # Mark as completed if all rows had an extraction if img_list_size == len(dict_imgs.keys()): # DONE: use out_indexer update_completed_dict = { update_id: { self.out_indexer.get_col_upcomp(): str(1) } } self.out_indexer.push_dict_rows( dict_rows=update_completed_dict, table_name=self.out_indexer.table_updateinfos_name) # Cleanup del self.q_in del self.q_out # To try to adjust a too optimistic nb_threads setting # if (sum(thread_creation_failed) > 0 or sum(deleted_extr) > 0) and self.nb_threads > 2: # self.nb_threads -= 1 msg = "[{}] Completed update {} in {}s." print( msg.format(self.pp, update_id, time.time() - start_update)) sys.stdout.flush() self.nb_err = 0 # Force garbage collection gc.collect() # Should we just raise an Exception and restart clean? if sum(thread_creation_failed) > 0 or sum(deleted_extr) > 0: # To try to adjust a too optimistic nb_threads setting if self.nb_threads > 1: self.nb_threads -= 1 self.extractors = [] gc.collect() else: raise RuntimeError( "Processed failed with a single thread...") except Exception as inst: #exc_type, exc_obj, exc_tb = sys.exc_info() #fulltb = traceback.format_tb(exc_tb) print("[{}.process_batch: ERROR] {}".format(self.pp, inst)) #print("[{}] {} ({})".format(self.pp, inst, ''.join(fulltb))) # Things are likely to be very bad at that point... Docker should be restarted #if self.nb_threads == 2: raise inst #raise type(inst)(" {} ({})".format(inst, ''.join(fulltb))) def run(self): """Run processor """ self.nb_empt = 0 self.nb_err = 0 while True: self.process_batch() msg = "[ExtractionProcessor: log] Nothing to process at: {}" print(msg.format(datetime.now().strftime('%Y-%m-%d:%H.%M.%S'))) sys.stdout.flush() time.sleep(10 * min(self.nb_empt, 60)) self.nb_empt += 1
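# Standalone sketch (not part of the original module) of the "manual join with
# timeout" pattern used in process_batch() above: instead of calling q.join(),
# which can block forever if a worker process crashed, the code polls the
# JoinableQueue's unfinished-task counter against a deadline. This relies on the
# same CPython implementation detail (_unfinished_tasks._semlock) referenced in
# the comments above.
def join_with_timeout(joinable_queue, timeout, poll_interval=1):
  """Return True if all queued tasks were marked done within `timeout` seconds."""
  import time  # already imported at module level in this file
  deadline = time.time() + timeout
  while not joinable_queue._unfinished_tasks._semlock._is_zero():
    if time.time() >= deadline:
      return False
    time.sleep(poll_interval)
  return True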
class ExtractionChecker(ConfReader): """ExtractionChecker class """ def __init__(self, global_conf, prefix=DEFAULT_EXTR_CHECK_PREFIX, pid=None): """ExtractionChecker constructor :param global_conf_in: configuration file or dictionary :type global_conf_in: str, dict :param prefix: prefix in configuration :type prefix: str :param pid: process id :type pid: int """ self.list_extr_prefix = [] self.pid = pid self.dict_sha1_infos = dict() super(ExtractionChecker, self).__init__(global_conf, prefix) self.last_push = time.time() self.nb_imgs_check = 0 self.nb_imgs_unproc = 0 self.nb_imgs_unproc_lastprint = 0 self.featurizer_type = self.get_required_param("featurizer_type") self.detector_type = self.get_required_param("detector_type") self.input_type = self.get_required_param("input_type") # Max delay self.max_delay = int(self.get_param("max_delay", default=3600)) self.list_extr_prefix = [self.featurizer_type, "feat", self.detector_type, self.input_type] self.extr_prefix = "_".join(self.list_extr_prefix) self.batch_check_column = None self.check_columns = [] # changed to: get column family from indexer in set_check_columns # Need to be build from extraction type and detection input + "_processed" #self.extr_family_column = self.get_param("extr_family_column", default="ext") # self.extr_prefix_base_column_name = self.extr_family_column + ":" + self.extr_prefix # self.extr_check_column = self.extr_prefix_base_column_name + "_processed" # # Need to be build from extraction type and extraction input + "_batchid" # self.batch_check_column = self.extr_prefix_base_column_name + "_updateid" # self.check_columns = [self.extr_check_column, self.batch_check_column] self.set_pp() # Initialize indexer self.indexer = HBaseIndexerMinimal(self.global_conf, prefix=self.get_required_param("indexer_prefix")) self.indexer.pp = "CheckerHBase" print(self.get_required_param("indexer_prefix"), self.indexer.get_dictcf_sha1_table()) self.set_check_columns() print(self.check_columns) # Initialize ingester try: self.ingester = GenericKafkaProcessor(self.global_conf, prefix=self.get_required_param("check_ingester_prefix")) except Exception as inst: # print "Could not initialize checker, sleeping for {}s.".format(self.max_delay) # time.sleep(self.max_delay) # raise(inst) #print("Could not initialize 'updates_out_topic' ({}). Will write only to HBase.".format(inst)) print("[{}: ERROR] Could not start ingester.".format(self.pp, inst)) raise inst # This will not be set for HBase processing, but checker would keep dying here... self.updates_out_topic = None try: self.updates_out_topic = self.ingester.get_required_param("producer_updates_out_topic") except Exception as inst: # print "Could not initialize checker, sleeping for {}s.".format(self.max_delay) # time.sleep(self.max_delay) # raise(inst) #print("Could not initialize 'updates_out_topic' ({}). Will write only to HBase.".format(inst)) print("{}. 
Will write only to HBase.".format(inst)) self.ingester.pp = "ec" if self.pid: self.ingester.pp += str(self.pid) def set_check_columns(self): """Set columns to be checked in indexer """ # changed to: get column family from indexer extr_prefix_base_column_name = self.indexer.extrcf + ":" + self.extr_prefix extr_check_column = extr_prefix_base_column_name + "_processed" # Need to be build from extraction type and extraction input + "_batchid" self.batch_check_column = extr_prefix_base_column_name + "_updateid" self.check_columns = [extr_check_column, self.batch_check_column] #print(self.check_columns) def set_pp(self, pp=""): """Set pretty name """ self.pp = "ExtractionChecker" self.pp += "-".join(self.list_extr_prefix) if self.pid: self.pp += "." + str(self.pid) def store_img_infos(self, msg): """Store information about the images of ``msg`` in ``self.dict_sha1_infos`` :param msg: Kafka record :type msg: collections.namedtuple """ # msg is technically a ConsumerRecord that is a collections.namedtuple, see: # https://github.com/dpkp/kafka-python/blob/master/kafka/consumer/fetcher.py#L30 strk = str(msg['sha1']) self.dict_sha1_infos[strk] = dict() for key in msg: # dumps json of 'img_info' # We actually need that only for DIG... if key == "img_info": self.dict_sha1_infos[strk][key] = json.dumps(msg[key]) else: # discard 'img_buffer' (if it exists?...), and 'sha1' # if k != "img_buffer" and k != "sha1": # self.dict_sha1_infos[strk][k] = msg[k] # discard 'sha1' if key != "sha1": self.dict_sha1_infos[strk][key] = msg[key] def cleanup_dict_infos(self, list_del_sha1s): """Remove images ``list_del_sha1s`` from ``self.dict_sha1_infos`` :param list_del_sha1s: list of images sha1 to remove :type list_del_sha1s: list """ for sha1 in list_del_sha1s: try: del self.dict_sha1_infos[str(sha1)] except: # could happen when cleaning up duplicates or image processed by another process pass def get_dict_push(self, list_get_sha1s, daemon=False): """Get dictionary to be pushed to HBase for images in ``list_get_sha1s`` :param list_get_sha1s: list of images :type list_get_sha1s: list :param daemon: whether the checker is running in daemon mode :type daemon: bool :return: (dict_push, update_id) :rtype: tuple """ #TODO: is this needed for every get_dict_push call? self.set_check_columns() # TODO: also pass current update_id, and move the creation of update id out of this method # this method should actually be used to 'claim' an image as soon as we can. dict_push = dict() # append processid to 'update_id' for safe use with multiple consumers, even after restart # /!\ beware, it should not contain underscores tmp_update_id, _ = self.indexer.get_next_update_id(today=None, extr_type=self.extr_prefix) update_id = tmp_update_id + '-' + self.ingester.pp + '-' + str(time.time()) for sha1 in list_get_sha1s: dict_push[str(sha1)] = dict() try: tmp_dict = self.dict_sha1_infos[str(sha1)] except: # This would mean the image has been marked as part of another batch by another process, # and thus deleted in a previous 'get_unprocessed_rows' call # This is also only relevant if we run on Daemon mode... # TODO: for transition we won't really have any info to push except the update_id... if daemon: del dict_push[str(sha1)] continue # build column names properly i.e. appending 'info:' for key in tmp_dict: # changed to: use column_family from indexer # But the use of 'key' here also means we rely on the input to define column name... 
#dict_push[str(sha1)]['info:' + key] = tmp_dict[key] dict_push[str(sha1)][self.indexer.imginfocf + ':' + key] = tmp_dict[key] dict_push[str(sha1)][self.batch_check_column] = update_id return dict_push, update_id def get_unprocessed_rows(self, list_check_sha1s): """Get the subset of the list of sha1s ``list_check_sha1s`` that have not been processed yet :param list_check_sha1s: list of images sha1 to check :type list_check_sha1s: list :return: set of unprocessed images :rtype: set """ # TODO: also pass current update_id and only delete if != from current update... unprocessed_rows = set(list_check_sha1s) if list_check_sha1s: # Check if the selected sha1 rows in HBase table 'sha1infos' have those check_column # This call will only return rows that DO have those check_column fam = self.indexer.get_dictcf_sha1_table() try: sha1s_rows = self.indexer.get_columns_from_sha1_rows(list_check_sha1s, self.check_columns, families=fam) except Exception as inst: print("[{}.get_unprocessed_rows: log] fam: {}".format(self.pp, fam)) raise inst #families=self.tablesha1_col_families) if sha1s_rows: # TODO: only delete if really previously processed, i.e. if != from current update... found_sha1_rows = set([str(row[0]) for row in sha1s_rows]) # Clean up 'dict_sha1_infos' deleting found_sha1_rows self.cleanup_dict_infos(found_sha1_rows) set_list_check_sha1s = set(list_check_sha1s) # TODO: but we should not re-add them, so we should discard them from unprocessed_rows unprocessed_rows = set_list_check_sha1s - found_sha1_rows return unprocessed_rows def run(self, daemon=False): """Run extraction checker :param daemon: whether we are running in daemon mode :type daemon: bool :raises Exception: if check fails """ i = 0 try: list_sha1s_to_process = [] # TODO: create update_id here while True: list_check_sha1s = [] try: # Accumulate images infos for msg_json in self.ingester.consumer: msg = json.loads(msg_json.value) # i += 1 # print((i, len(list_check_sha1s), msg)) # msg could now contain keys 'sha1' or 'list_sha1s' # should we check that we can't have both or other keys?... if 'sha1' in msg: list_check_sha1s.append(str(msg['sha1'])) # Store other fields to be able to push them too self.store_img_infos(msg) elif 'list_sha1s' in msg: for sha1 in msg['list_sha1s']: list_check_sha1s.append(str(sha1)) # We won't have any additional infos no? # But we should still build a dict for each sample for consistency... tmp_dict = dict() tmp_dict['sha1'] = str(sha1) # will basically push an empty dict to self.dict_sha1_infos, so self.get_dict_push # works properly later on... self.store_img_infos(tmp_dict) else: print('Unknown keys in msg: {}'.format(msg.keys())) if len(list_check_sha1s) >= self.indexer.batch_update_size: break except Exception as inst: # trying to use 'consumer_timeout_ms' to raise timeout and get last samples msg = "[{}: warning] At {}, caught {} {} in consumer loop" now_str = datetime.now().strftime('%Y-%m-%d:%H.%M.%S') print(msg.format(self.pp, now_str, type(inst), inst)) sys.stdout.flush() if not list_check_sha1s: # TODO: should we fallback to scanning Hbase table here? 
continue # Check which images have not been processed (or pushed in an update) yet unprocessed_rows = self.get_unprocessed_rows(list_check_sha1s) self.nb_imgs_check += len(list_check_sha1s) push_delay = (time.time() - self.last_push) > self.max_delay / 60 if push_delay and self.nb_imgs_unproc_lastprint != self.nb_imgs_unproc: msg = "[{}: log] Found {}/{} unprocessed images" print(msg.format(self.pp, self.nb_imgs_unproc, self.nb_imgs_check)) self.nb_imgs_unproc_lastprint = self.nb_imgs_unproc # TODO: we should mark those images as being 'owned' by the update we are constructing # (only important if we are running multiple threads i.e. daemon is True) # otherwise another update running at the same time could also claim it (in another ad) # could be handle when adding data to the searcher but duplicates in extraction process... # Push sha1s to be processed for sha1 in unprocessed_rows: list_sha1s_to_process.append(sha1) # Remove potential duplicates list_sha1s_to_process = list(set(list_sha1s_to_process)) if list_sha1s_to_process: # Push them to HBase by batch of 'batch_update_size' push_delay = (time.time() - self.last_push) > self.max_delay full_batch = len(list_sha1s_to_process) >= self.indexer.batch_update_size if full_batch or (push_delay and list_sha1s_to_process): # Trim here to push exactly a batch of 'batch_update_size' list_push = list_sha1s_to_process[:min(self.indexer.batch_update_size, len(list_sha1s_to_process))] # TODO: this should be done before, # to 'claim' the images as soon as we plan to process them for this update # Gather corresponding sha1 infos dict_push, update_id = self.get_dict_push(list_push, daemon=daemon) if dict_push: self.nb_imgs_unproc += len(dict_push.keys()) msg = "[{}: at {}] Pushing update {} of {} images." now_str = datetime.now().strftime('%Y-%m-%d:%H.%M.%S') print(msg.format(self.pp, now_str, update_id, len(dict_push.keys()))) sys.stdout.flush() # Push images fam = self.indexer.get_dictcf_sha1_table() if self.verbose > 4: msg = "[{}] Pushing images for update {} with fam {}" print(msg.format(self.pp, update_id, fam)) sha1s_table = self.indexer.table_sha1infos_name self.indexer.push_dict_rows(dict_push, sha1s_table, families=fam) # Build HBase updates dict dict_updates_db = dict() now_str = datetime.now().strftime('%Y-%m-%d:%H.%M.%S') list_sha1s_col = self.indexer.get_col_listsha1s() dict_updates_db[update_id] = {list_sha1s_col: ','.join(dict_push.keys()), self.indexer.get_col_upcreate(): now_str} # Push it fam = self.indexer.get_dictcf_update_table() if self.verbose > 4: msg = "[{}] Pushing update {} info with fam {}" print(msg.format(self.pp, update_id, fam)) self.indexer.push_dict_rows(dict_updates_db, self.indexer.table_updateinfos_name, families=fam) # Build HBase updates dict if self.updates_out_topic is not None: dict_updates_kafka = dict() dict_updates_kafka[update_id] = ','.join(dict_push.keys()) # Push it self.ingester.producer.send(self.updates_out_topic, json.dumps(dict_updates_kafka)) # Gather any remaining sha1s and clean up infos if len(list_sha1s_to_process) > self.indexer.batch_update_size: list_sha1s_to_process = list_sha1s_to_process[self.indexer.batch_update_size:] else: list_sha1s_to_process = [] # if duplicates wrt list_push, remove them. Can this still happen? 
list_sha1s_to_process = [sh1 for sh1 in list_sha1s_to_process if sh1 not in list_push] self.cleanup_dict_infos(list_push) else: msg = "[{}: at {}] Nothing to push for update {}" print(msg.format(self.pp, datetime.now().strftime('%Y-%m-%d:%H.%M.%S'), update_id)) sys.stdout.flush() self.last_push = time.time() # TODO: we should create a new update_id here, # and let it claim the potential remaining images in 'list_sha1s_to_process' # sanity check that len(list_sha1s_to_process) == len(self.dict_sha1_infos) ? except Exception as inst: exc_type, exc_obj, exc_tb = sys.exc_info() fulltb = traceback.format_tb(exc_tb) raise type(inst)(" {} ({})".format(inst, ''.join(fulltb)))
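# --- Usage sketch (not part of the original module) ---
# A minimal, hypothetical way to instantiate and run the checker above. The exact
# parameter keys depend on how ConfReader combines the prefix with the parameter
# name (assumed here to be simple concatenation); the prefix value and all settings
# below are placeholders, not the project's real configuration.
PREFIX = "EXTR_CHECK_"  # assumed stand-in for DEFAULT_EXTR_CHECK_PREFIX

conf = {
    PREFIX + "featurizer_type": "dlib",        # placeholder extraction settings
    PREFIX + "detector_type": "full",
    PREFIX + "input_type": "image",
    PREFIX + "max_delay": 600,                 # optional, defaults to 3600 seconds
    PREFIX + "indexer_prefix": "HBI_",         # hypothetical HBase indexer prefix
    PREFIX + "check_ingester_prefix": "KIN_",  # hypothetical Kafka ingester prefix
    # ... plus the "HBI_"- and "KIN_"-prefixed parameters those components require ...
}

checker = ExtractionChecker(conf, prefix=PREFIX, pid=1)
checker.run(daemon=False)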
class ExtractionChecker(ConfReader): """ExtractionChecker class """ def __init__(self, global_conf, prefix=DEFAULT_EXTR_CHECK_PREFIX, pid=None): """ExtractionChecker constructor :param global_conf_in: configuration file or dictionary :type global_conf_in: str, dict :param prefix: prefix in configuration :type prefix: str :param pid: process id :type pid: int """ self.list_extr_prefix = [] self.pid = pid self.dict_sha1_infos = dict() super(ExtractionChecker, self).__init__(global_conf, prefix) self.last_push = time.time() self.nb_imgs_check = 0 self.nb_imgs_unproc = 0 self.nb_imgs_unproc_lastprint = 0 self.featurizer_type = self.get_required_param("featurizer_type") self.detector_type = self.get_required_param("detector_type") self.input_type = self.get_required_param("input_type") # Max delay self.max_delay = int( self.get_param("max_delay", default=DEFAULT_MAX_DELAY)) self.min_len_check = int( self.get_param("min_len_check", default=DEFAULT_MIN_LENGTH_CHECK)) self.list_extr_prefix = [ self.featurizer_type, "feat", self.detector_type, self.input_type ] self.extr_prefix = "_".join(self.list_extr_prefix) self.batch_check_column = None self.check_columns = [] # changed to: get column family from indexer in set_check_columns # Need to be build from extraction type and detection input + "_processed" #self.extr_family_column = self.get_param("extr_family_column", default="ext") # self.extr_prefix_base_column_name = self.extr_family_column + ":" + self.extr_prefix # self.extr_check_column = self.extr_prefix_base_column_name + "_processed" # # Need to be build from extraction type and extraction input + "_batchid" # self.batch_check_column = self.extr_prefix_base_column_name + "_updateid" # self.check_columns = [self.extr_check_column, self.batch_check_column] self.set_pp() # Initialize indexer self.indexer = HBaseIndexerMinimal( self.global_conf, prefix=self.get_required_param("indexer_prefix")) self.indexer.pp = "CheckerHBase" print(self.get_required_param("indexer_prefix"), self.indexer.get_dictcf_sha1_table()) self.set_check_columns() print(self.check_columns) # Initialize ingester, that could now be Kafka or Kinesis. Should we have a default? ingester_type = self.get_required_param("image_ingestion_type") try: if ingester_type == "kafka": self.ingester = KafkaIngester( self.global_conf, prefix=self.get_required_param("check_ingester_prefix")) elif ingester_type == "kinesis": self.ingester = KinesisIngester( self.global_conf, prefix=self.get_required_param("check_ingester_prefix")) else: raise ValueError( "Unknown 'ingester_type': {}".format(ingester_type)) except Exception as inst: # print "Could not initialize checker, sleeping for {}s.".format(self.max_delay) # time.sleep(self.max_delay) # raise(inst) #print("Could not initialize 'updates_out_topic' ({}). Will write only to HBase.".format(inst)) print("[{}: ERROR] Could not start ingester.".format( self.pp, inst)) raise inst # Initialize producer # TODO: also check for 'update_ingestion_type' as producer_type? 
producer_type = self.get_param("update_ingestion_type", DEFAULT_UPDATE_INGESTION_TYPE) # TODO: create a producer if 'update_ingestion_type' is Kinesis or Kafka # if producer_type != "hbase": # self.updates_out_topic = self.ingester.get_required_param("producer_updates_out_topic") if producer_type == "kafka": self.pusher = KafkaPusher( self.global_conf, prefix=self.get_required_param("check_ingester_prefix")) elif producer_type == "kinesis": self.pusher = KinesisPusher( self.global_conf, prefix=self.get_required_param("check_ingester_prefix")) elif producer_type == "hbase": self.pusher = None print("[{}: log] Will write updates only to HBase.".format( self.pp)) else: raise ValueError( "Unknown 'producer_type': {}".format(producer_type)) #self.ingester.pp = self.get_param("pp", "ImageIngester") # Only if daemon mode, as we may have multiple ingesters # But for Kinesis the `shard_infos_filename` may not be re-used... #if self.pid: # self.ingester.pp += str(self.pid) def set_check_columns(self): """Set columns to be checked in indexer """ # changed to: get column family from indexer # TODO: get the suffixes as global variables maybe from common.defaults extr_prefix_base_column_name = self.indexer.extrcf + ":" + self.extr_prefix extr_check_column = extr_prefix_base_column_name + "_processed" # Need to be build from extraction type and extraction input + "_batchid" self.batch_check_column = extr_prefix_base_column_name + "_updateid" self.check_columns = [extr_check_column, self.batch_check_column] #print(self.check_columns) def set_pp(self, pp=""): """Set pretty name """ self.pp = "ExtractionChecker" self.pp += "-".join(self.list_extr_prefix) if self.pid: self.pp += "." + str(self.pid) def store_img_infos(self, msg): """Store information about the images of ``msg`` in ``self.dict_sha1_infos`` :param msg: message :type msg: dict """ strk = str(msg['sha1']).upper() self.dict_sha1_infos[strk] = dict() for key in msg: # dumps json of 'img_info' # We actually need that only for DIG... if key == "img_info": self.dict_sha1_infos[strk][key] = json.dumps(msg[key]) else: # discard 'img_buffer' (if it exists?...), and 'sha1' # if k != "img_buffer" and k != "sha1": # self.dict_sha1_infos[strk][k] = msg[k] # discard 'sha1' if key != "sha1": self.dict_sha1_infos[strk][key] = msg[key] def cleanup_dict_infos(self, list_del_sha1s): """Remove images ``list_del_sha1s`` from ``self.dict_sha1_infos`` :param list_del_sha1s: list of images sha1 to remove :type list_del_sha1s: list """ for sha1 in list_del_sha1s: try: del self.dict_sha1_infos[str(sha1)] except: # could happen when cleaning up duplicates or image processed by another process pass def get_dict_push(self, list_get_sha1s, daemon=False): """Get dictionary to be pushed to HBase for images in ``list_get_sha1s`` :param list_get_sha1s: list of images :type list_get_sha1s: list :param daemon: whether the checker is running in daemon mode :type daemon: bool :return: (dict_push, update_id) :rtype: tuple """ #TODO: is this needed for every get_dict_push call? self.set_check_columns() # TODO: also pass current update_id, and move the creation of update id out of this method # this method should actually be used to 'claim' an image as soon as we can. 
dict_push = dict() # append processid to 'update_id' for safe use with multiple consumers, even after restart # /!\ beware, it should not contain underscores tmp_update_id, _ = self.indexer.get_next_update_id( today=None, extr_type=self.extr_prefix) update_id = tmp_update_id + '-' + self.ingester.pp + '-' + str( time.time()) for sha1 in list_get_sha1s: dict_push[str(sha1)] = dict() try: tmp_dict = self.dict_sha1_infos[str(sha1)] except: # This would mean the image has been marked as part of another batch by another process, # and thus deleted in a previous 'get_unprocessed_rows' call # This is also only relevant if we run on Daemon mode... # TODO: for transition we won't really have any info to push except the update_id... if daemon: del dict_push[str(sha1)] continue # build column names properly i.e. appending 'info:' for key in tmp_dict: # changed to: use column_family from indexer # But the use of 'key' here also means we rely on the input to define column name... #dict_push[str(sha1)]['info:' + key] = tmp_dict[key] dict_push[str(sha1)][self.indexer.imginfocf + ':' + key] = tmp_dict[key] dict_push[str(sha1)][self.batch_check_column] = update_id return dict_push, update_id def get_unprocessed_rows(self, list_check_sha1s): """Get the subset of the list of sha1s ``list_check_sha1s`` that have not been processed yet :param list_check_sha1s: list of images sha1 to check :type list_check_sha1s: list :return: set of unprocessed images :rtype: set """ # TODO: also pass current update_id and only delete if != from current update... unprocessed_rows = set(list_check_sha1s) if list_check_sha1s: # Check if the selected sha1 rows in HBase table 'sha1infos' have those check_column # This call will only return rows that DO have those check_column fam = self.indexer.get_dictcf_sha1_table() try: sha1s_rows = self.indexer.get_columns_from_sha1_rows( list_check_sha1s, self.check_columns, families=fam) except Exception as inst: print("[{}.get_unprocessed_rows: log] fam: {}".format( self.pp, fam)) raise inst #families=self.tablesha1_col_families) if sha1s_rows: # TODO: only delete if really previously processed, i.e. if != from current update... found_sha1_rows = set([str(row[0]) for row in sha1s_rows]) # Clean up 'dict_sha1_infos' deleting found_sha1_rows self.cleanup_dict_infos(found_sha1_rows) set_list_check_sha1s = set(list_check_sha1s) # TODO: but we should not re-add them, so we should discard them from unprocessed_rows unprocessed_rows = set_list_check_sha1s - found_sha1_rows return unprocessed_rows def run(self, daemon=False): """Run extraction checker :param daemon: whether we are running in daemon mode :type daemon: bool :raises Exception: if check fails """ # import inspect # if not inspect.isgeneratorfunction(self.ingester.get_msg_json()): # msg = "[{}: Warning] Ingester {} function `get_msg_json` is not a generator" # print(msg.format(self.pp, type(self.ingester))) try: list_sha1s_to_process = [] list_check_sha1s = [] # TODO: create update_id here if self.verbose > 1: msg = "[{}: log] Start run main loop" msg.format(self.pp) while True: try: # Accumulate images infos #while len(list_check_sha1s) < self.indexer.batch_update_size: #while len(list_check_sha1s) < self.min_len_check: for msg in self.ingester.get_msg_json(): try: # Fix if input was JSON dumped twice? 
if not isinstance(msg, dict): msg = json.loads(msg) # msg could now contain keys 'sha1' or 'list_sha1s' if 'sha1' in msg: list_check_sha1s.append( str(msg['sha1']).upper()) # Store other fields to be able to push them too self.store_img_infos(msg) elif 'list_sha1s' in msg: for sha1 in msg['list_sha1s']: list_check_sha1s.append(str(sha1).upper()) # We won't have any additional infos no? # But we should still build a dict for each sample for consistency... tmp_dict = dict() tmp_dict['sha1'] = str(sha1).upper() # will basically push a dict with just the sha1 to self.dict_sha1_infos, so self.get_dict_push # works properly later on... self.store_img_infos(tmp_dict) else: raise ValueError( 'Unknown keys in msg: {}'.format( msg.keys())) # This is dangerous, as it assumes the self.ingester.get_msg_json() generator # would restart from the next point... Is this the case for Kafka? prev_len = len(list_check_sha1s) list_check_sha1s = list(set(list_check_sha1s)) if len(list_check_sha1s) < prev_len: msg = "[{}: log] Removed {} duplicate from `list_check_sha1s`" print( msg.format( self.pp, prev_len - len(list_check_sha1s))) if len(list_check_sha1s ) >= self.indexer.batch_update_size: break except Exception as inst: pr_msg = "[{}: ERROR] Could not process message: {}. {}" print(pr_msg.format(self.pp, msg, inst)) except Exception as inst: pr_msg = "[{}: at {} ERROR] Caught {} {} in consumer loop" now_str = datetime.now().strftime('%Y-%m-%d:%H.%M.%S') print(pr_msg.format(self.pp, now_str, type(inst), inst)) if msg is not None: print(msg) sys.stdout.flush() if self.verbose > 3: msg = "[{}: log] Gathered {} images to check so far" msg = msg.format(self.pp, len(list_check_sha1s)) msg2 = "" if len(list_check_sha1s) > 0: msg2 = " (first: {}, last: {})" msg2 = msg2.format(list_check_sha1s[0], list_check_sha1s[-1]) print(msg + msg2) # To be able to push one (non empty) update every max_delay #if not list_check_sha1s and (time.time() - self.last_push) < self.max_delay: if len(list_check_sha1s) < self.indexer.batch_update_size and ( time.time() - self.last_push) < self.max_delay: time.sleep(1) continue self.nb_imgs_check += len(list_check_sha1s) push_delay = (time.time() - self.last_push) > max( int(self.max_delay / 60), 10) if push_delay and self.nb_imgs_unproc_lastprint != self.nb_imgs_unproc: msg = "[{}: log] Pushed {} unprocessed images so far" print( msg.format(self.pp, self.nb_imgs_unproc, self.nb_imgs_check)) self.nb_imgs_unproc_lastprint = self.nb_imgs_unproc if list_check_sha1s: # Check which images have not been processed (or pushed in an update) yet # This seems slow start_check = time.time() unprocessed_rows = self.get_unprocessed_rows( list_check_sha1s) msg = "[{}: log] Found {}/{} unprocessed images in {:.2f}s" print( msg.format(self.pp, len(unprocessed_rows), len(list_check_sha1s), time.time() - start_check)) if len(unprocessed_rows) != len( list_check_sha1s) and self.verbose > 5: already_processed = list( set(list_check_sha1s) - set(unprocessed_rows)) msg = "[{}: log] Images ".format(self.pp) for ap in already_processed: msg += "{} ".format(ap) msg += "were already processed." print(msg) #unprocessed_rows = self.get_unprocessed_rows(list_check_sha1s) # TODO: we should mark those images as being 'owned' by the update we are constructing # (only important if we are running multiple threads i.e. daemon is True) # otherwise another update running at the same time could also claim it (in another ad) # could be handle when adding data to the searcher but duplicates in extraction process... 
# Push sha1s to be processed for sha1 in unprocessed_rows: list_sha1s_to_process.append(sha1) # Remove potential duplicates list_sha1s_to_process = list(set(list_sha1s_to_process)) list_check_sha1s = [] if list_sha1s_to_process: # Push them to HBase by batch of 'batch_update_size' push_delay = (time.time() - self.last_push) > self.max_delay full_batch = len(list_sha1s_to_process ) >= self.indexer.batch_update_size if full_batch or (push_delay and list_sha1s_to_process): # Trim here to push exactly a batch of 'batch_update_size' list_push = list_sha1s_to_process[:min( self.indexer. batch_update_size, len(list_sha1s_to_process))] # TODO: this should be done before, # to 'claim' the images as soon as we plan to process them for this update # Gather corresponding sha1 infos dict_push, update_id = self.get_dict_push( list_push, daemon=daemon) if dict_push: self.nb_imgs_unproc += len(dict_push.keys()) msg = "[{}: at {}] Pushing update {} of {} images." now_str = datetime.now().strftime( '%Y-%m-%d:%H.%M.%S') print( msg.format(self.pp, now_str, update_id, len(dict_push.keys()))) sys.stdout.flush() # Push images fam = self.indexer.get_dictcf_sha1_table() if self.verbose > 5: msg = "[{}] Pushing images for update {} with fam {}" print(msg.format(self.pp, update_id, fam)) sha1s_table = self.indexer.table_sha1infos_name self.indexer.push_dict_rows(dict_push, sha1s_table, families=fam) # Build HBase updates dict dict_updates_db = dict() now_str = datetime.now().strftime( '%Y-%m-%d:%H.%M.%S') list_sha1s_col = self.indexer.get_col_listsha1s() dict_updates_db[update_id] = { list_sha1s_col: ','.join(dict_push.keys()), self.indexer.get_col_upcreate(): now_str } # Push it fam = self.indexer.get_dictcf_update_table() if self.verbose > 5: msg = "[{}] Pushing update {} info with fam {}" print(msg.format(self.pp, update_id, fam)) self.indexer.push_dict_rows( dict_updates_db, self.indexer.table_updateinfos_name, families=fam) # Build pusher updates dict if self.pusher is not None: dict_updates_kafka = dict() dict_updates_kafka[update_id] = ','.join( dict_push.keys()) # Push it #self.ingester.producer.send(self.updates_out_topic, json.dumps(dict_updates_kafka)) #self.pusher.send(self.updates_out_topic, dict_updates_kafka) self.pusher.send(dict_updates_kafka) # Gather any remaining sha1s and clean up infos if len(list_sha1s_to_process ) > self.indexer.batch_update_size: list_sha1s_to_process = list_sha1s_to_process[ self.indexer.batch_update_size:] else: list_sha1s_to_process = [] # if duplicates wrt list_push, remove them. Can this still happen? list_sha1s_to_process = [ sh1 for sh1 in list_sha1s_to_process if sh1 not in list_push ] self.cleanup_dict_infos(list_push) else: msg = "[{}: at {}] Nothing to push for update {}" print( msg.format( self.pp, datetime.now().strftime( '%Y-%m-%d:%H.%M.%S'), update_id)) sys.stdout.flush() self.last_push = time.time() # TODO: we should create a new update_id here, # and let it claim the potential remaining images in 'list_sha1s_to_process' # sanity check that len(list_sha1s_to_process) == len(self.dict_sha1_infos) ? else: if self.verbose > 3: msg = "[{}: at {}] Gathered {} images so far..." now_str = datetime.now().strftime( '%Y-%m-%d:%H.%M.%S') print( msg.format(self.pp, now_str, len(list_sha1s_to_process))) except Exception as inst: exc_type, exc_obj, exc_tb = sys.exc_info() fulltb = traceback.format_tb(exc_tb) raise type(inst)(" {} ({})".format(inst, ''.join(fulltb)))
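# --- Illustration (not part of the original module) ---
# set_check_columns() derives the HBase columns the checker looks for from the
# indexer's extraction column family and the extraction prefix. Assuming a column
# family of "ext" and the dlib face setup seen elsewhere in this codebase, the
# construction below mirrors that logic:
extrcf = "ext"                                            # assumed value of indexer.extrcf
extr_prefix = "_".join(["dlib", "feat", "dlib", "face"])  # featurizer, "feat", detector, input
base = extrcf + ":" + extr_prefix
print(base + "_processed")   # -> ext:dlib_feat_dlib_face_processed
print(base + "_updateid")    # -> ext:dlib_feat_dlib_face_updateid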
class ExtractionChecker(ConfReader): def __init__(self, global_conf, prefix=default_extr_check_prefix, pid=None): self.list_extr_prefix = [] self.pid = pid self.dict_sha1_infos = dict() super(ExtractionChecker, self).__init__(global_conf, prefix) self.featurizer_type = self.get_required_param("featurizer_type") self.detector_type = self.get_required_param("detector_type") self.input_type = self.get_required_param("input_type") # Need to be build from extraction type and detection input + "_processed" self.extr_family_column = "ext" tmp_extr_family_column = self.get_param("extr_family_column") if tmp_extr_family_column: self.extr_family_column = tmp_extr_family_column # Max delay self.max_delay = 3600 # self.max_delay = 600 max_delay = self.get_param("max_delay") if max_delay: self.max_delay = int(max_delay) self.last_push = time.time() self.nb_imgs_check = 0 self.nb_imgs_unproc = 0 self.nb_imgs_unproc_lastprint = 0 # Beware, the self.extr_family_column should be added to the indexer families parameter in get_create_table... # TODO: should we add the 'ad' column family too here by default self.tablesha1_col_families = {'info': dict(), self.extr_family_column: dict()} self.list_extr_prefix = [self.featurizer_type, "feat", self.detector_type, self.input_type] self.extr_prefix = "_".join(self.list_extr_prefix) self.extr_prefix_base_column_name = self.extr_family_column + ":" + self.extr_prefix self.extr_check_column = self.extr_prefix_base_column_name + "_processed" # Need to be build from extraction type and extraction input + "_batchid" self.batch_check_column = self.extr_prefix_base_column_name + "_updateid" self.check_columns = [self.extr_check_column, self.batch_check_column] self.set_pp() # Initialize indexer and ingester self.indexer = HBaseIndexerMinimal(self.global_conf, prefix=self.get_required_param("indexer_prefix")) self.ingester = GenericKafkaProcessor(self.global_conf, prefix=self.get_required_param("check_ingester_prefix")) # This will not be set for HBase processing, but checker would keep dying here... try: self.updates_out_topic = self.ingester.get_required_param("producer_updates_out_topic") except Exception as inst: print "Could not initialize checker, sleeping for {}s.".format(self.max_delay) time.sleep(self.max_delay) raise(inst) self.ingester.pp = "ec" if self.pid: self.ingester.pp += str(self.pid) def set_pp(self): self.pp = "ExtractionChecker." self.pp += "-".join(self.list_extr_prefix) if self.pid: self.pp += "." + str(self.pid) def store_img_infos(self, msg): strk = str(msg['sha1']) self.dict_sha1_infos[strk] = dict() for k in msg: # dumps json of 'img_info' if k == "img_info": self.dict_sha1_infos[strk][k] = json.dumps(msg[k]) else: # discard 'img_buffer' (if it exists?...), and 'sha1' # if k != "img_buffer" and k != "sha1": # self.dict_sha1_infos[strk][k] = msg[k] # discard 'sha1' if k != "sha1": self.dict_sha1_infos[strk][k] = msg[k] def cleanup_dict_infos(self, list_del_sha1s): for sha1 in list_del_sha1s: try: del self.dict_sha1_infos[str(sha1)] except: # could happen when cleaning up duplicates or image processed by another process pass def get_dict_push(self, list_get_sha1s): # TODO: also pass current update_id, and move the creation of update id out of this method # this method should actually be used to 'claim' an image as soon as we can. dict_push = dict() # append unique processid to 'update_id' to make it safe to use with multiple consumers, even after a restart. 
# /!\ beware, it should not contain underscores tmp_update_id, _ = self.indexer.get_next_update_id(today=None, extr_type=self.extr_prefix) update_id = tmp_update_id + '-' + self.ingester.pp + '-' + str(time.time()) for sha1 in list_get_sha1s: dict_push[str(sha1)] = dict() try: tmp_dict = self.dict_sha1_infos[str(sha1)] except: # This would mean the image has been marked as part of another batch by another process, # and thus deleted in a previous 'get_unprocessed_rows' call del dict_push[str(sha1)] continue # build column names properly i.e. appending 'info:' for k in tmp_dict: dict_push[str(sha1)]['info:' + k] = tmp_dict[k] dict_push[str(sha1)][self.batch_check_column] = update_id return dict_push, update_id def get_unprocessed_rows(self, list_sha1s_to_check): # TODO: also pass current update_id and only delete if != from current update... unprocessed_rows = set(list_sha1s_to_check) if list_sha1s_to_check: # Check if the selected sha1 rows in HBase table 'sha1infos' have those check_column # This call will only return rows that DO have those check_column sha1s_rows = self.indexer.get_columns_from_sha1_rows(list_sha1s_to_check, self.check_columns, families=self.tablesha1_col_families) if sha1s_rows: # TODO: only delete if really previously processed, i.e. if != from current update... found_sha1_rows = set([str(row[0]) for row in sha1s_rows]) # Clean up 'dict_sha1_infos' deleting found_sha1_rows self.cleanup_dict_infos(found_sha1_rows) set_list_sha1s_to_check = set(list_sha1s_to_check) # TODO: but we should not re-add them, so we should discard them from unprocessed_rows unprocessed_rows = set_list_sha1s_to_check - found_sha1_rows return unprocessed_rows def run(self): try: list_sha1s_to_process = [] # TODO: create update_id here while True: list_sha1s_to_check = [] try: # Accumulate images infos for msg_json in self.ingester.consumer: msg = json.loads(msg_json.value) list_sha1s_to_check.append(str(msg['sha1'])) # Store other fields to be able to push them too self.store_img_infos(msg) if len(list_sha1s_to_check) >= self.indexer.batch_update_size: break except Exception as inst: # trying to use 'consumer_timeout_ms' to raise timeout and get last samples warn_msg = "[{}: warning] At {}, caught {} {} in consumer loop" print warn_msg.format(self.pp, datetime.now().strftime('%Y-%m-%d:%H.%M.%S'), type(inst), inst) sys.stdout.flush() # Check which images have not been processed (or pushed in an update) yet unprocessed_rows = self.get_unprocessed_rows(list_sha1s_to_check) self.nb_imgs_check += len(list_sha1s_to_check) if ( time.time() - self.last_push) > self.max_delay / 60 and self.nb_imgs_unproc_lastprint != self.nb_imgs_unproc: msg_log = "[{}: log] Found {}/{} unprocessed images" print msg_log.format(self.pp, self.nb_imgs_unproc, self.nb_imgs_check) self.nb_imgs_unproc_lastprint = self.nb_imgs_unproc # TODO: we should mark those images as being 'owned' by the update we are constructing # (only reallyimportant if we are running multiple threads...) # otherwise another update running at the same time could also claim it (if it appears in another ad) # this can be handle when adding data to the searcher but induces duplicates in extraction process... 
# Push sha1s to be processed for sha1 in unprocessed_rows: list_sha1s_to_process.append(sha1) # Remove potential duplicates list_sha1s_to_process = list(set(list_sha1s_to_process)) if list_sha1s_to_process: # Push them to HBase by batch of 'batch_update_size' if len(list_sha1s_to_process) >= self.indexer.batch_update_size or ( (time.time() - self.last_push) > self.max_delay and len(list_sha1s_to_process) > 0): # Trim here to push exactly a batch of 'batch_update_size' list_push = list_sha1s_to_process[:min(self.indexer.batch_update_size, len(list_sha1s_to_process))] # TODO: this should be done before, to 'claim' the images as soon as we plan to process them for this update # Gather corresponding sha1 infos dict_push, update_id = self.get_dict_push(list_push) if dict_push: self.nb_imgs_unproc += len(dict_push.keys()) push_msg = "[{}: at {}] Pushing update {} of {} images." print push_msg.format(self.pp, datetime.now().strftime('%Y-%m-%d:%H.%M.%S'), update_id, len(dict_push.keys())) sys.stdout.flush() # Push images self.indexer.push_dict_rows(dict_push, self.indexer.table_sha1infos_name, families=self.tablesha1_col_families) # Build updates dict dict_updates_db = dict() dict_updates_kafka = dict() dict_updates_db[update_id] = {self.indexer.column_list_sha1s: ','.join(dict_push.keys()), 'info:' + update_str_created: datetime.now().strftime('%Y-%m-%d:%H.%M.%S')} dict_updates_kafka[update_id] = ','.join(dict_push.keys()) # Push them self.indexer.push_dict_rows(dict_updates_db, self.indexer.table_updateinfos_name) self.ingester.producer.send(self.updates_out_topic, json.dumps(dict_updates_kafka)) # Gather any remaining sha1s and clean up infos if len(list_sha1s_to_process) > self.indexer.batch_update_size: list_sha1s_to_process = list_sha1s_to_process[self.indexer.batch_update_size:] else: list_sha1s_to_process = [] # if duplicates wrt list_push, remove them. Can this still happen? list_sha1s_to_process = [sha1 for sha1 in list_sha1s_to_process if sha1 not in list_push] self.cleanup_dict_infos(list_push) else: no_push_msg = "[{}: at {}] Nothing to push for update {}" print no_push_msg.format(self.pp, datetime.now().strftime('%Y-%m-%d:%H.%M.%S'), update_id) sys.stdout.flush() self.last_push = time.time() # TODO: we should create a new update_id here, and let it claim the potential remaining images in 'list_sha1s_to_process' # sanity check that len(list_sha1s_to_process) == len(self.dict_sha1_infos) ? except Exception as inst: exc_type, exc_obj, exc_tb = sys.exc_info() fulltb = traceback.format_tb(exc_tb) raise type(inst)(" {} ({})".format(inst, ''.join(fulltb)))
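# --- Illustration (not part of the original module) ---
# All checker variants above build the update row key the same way in
# get_dict_push(): the id returned by the indexer, the ingester's pretty name and
# a timestamp, joined with '-' (underscores are deliberately avoided). The first
# component below is made up, since get_next_update_id() lives in
# HBaseIndexerMinimal and its return format is not shown here.
import time

tmp_update_id = "index-update-20180101"  # hypothetical value from get_next_update_id()
ingester_pp = "ec1"                      # "ec" + pid, as set in the Kafka-only checker
update_id = tmp_update_id + '-' + ingester_pp + '-' + str(time.time())
# e.g. "index-update-20180101-ec1-1514764800.0"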
from cufacesearch.indexer.hbase_indexer_minimal import HBaseIndexerMinimal
from cufacesearch.detector.utils import show_bbox_from_URL
import numpy as np
import base64

hbim = HBaseIndexerMinimal('../conf/global_conf_test_get_face_hbase.json')
list_sha1s = ['000000D29139258BD3716C94A68CFF54A8A7C033', '000001BF13372B9665A89ED25E8948FC7F99F7F1']
# TODO: use column_family and column_name from indexer
rows = hbim.get_columns_from_sha1_rows(list_sha1s, ['face', 'info:s3_url'])
for sha1, data in rows:
  print sha1, data
  url = data['info:s3_url']
  for key in data:
    if key.startswith('face:'):
      face_bbox = key.split('face:dlib_feat_dlib_face_')[-1].split('_')
      feat_b64 = np.frombuffer(base64.b64decode(data[key]), dtype=np.float32)
      print feat_b64.shape, feat_b64
      show_bbox_from_URL(url, map(int, face_bbox), close_after=1)
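# --- Illustration (not part of the original script) ---
# The script above decodes features stored as base64-encoded float32 buffers. A
# self-contained round trip of that encoding, independent of HBase (the 128
# dimensions are only an assumption, based on typical dlib face descriptors):
import base64
import numpy as np

feat = np.random.rand(128).astype(np.float32)
encoded = base64.b64encode(feat.tobytes())                          # what would be stored in HBase
decoded = np.frombuffer(base64.b64decode(encoded), dtype=np.float32)
assert np.array_equal(feat, decoded)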
# This should be run in the docker from __future__ import print_function from cufacesearch.indexer.hbase_indexer_minimal import HBaseIndexerMinimal import sys start_row = '0' * 40 # Change that hbim = HBaseIndexerMinimal( '../conf/generated/conf_extraction_lfw_local_dlib.json') #hbim.get_updates_from_date() nb_face = 0 nb_image = 0 prev_row = '~' curr_row = start_row #print(curr_row) file_names = [] sha1s = [] #print('Scanning', end='', flush=True) sys.stdout.write('Scanning') sys.stdout.flush() while prev_row != curr_row + '~': prev_row = curr_row #if prev_row != start_row: prev_row += '~' #print(prev_row) #print('.', end='', flush=True) sys.stdout.write('.') sys.stdout.flush() for row in hbim.scan_from_row(hbim.table_sha1infos_name, row_start=prev_row,