class ExtractionProcessor(ConfReader):

  def __init__(self, global_conf, prefix=default_extr_proc_prefix):
    self.extractor = None
    self.nb_empt = 0
    self.nb_err = 0
    self.max_proc_time = 1200 # in seconds. Increased for sbcmdline...
    self.url_input = True

    super(ExtractionProcessor, self).__init__(global_conf, prefix)

    self.input_type = self.get_required_param("input_type")
    self.nb_threads = self.get_required_param("nb_threads")
    self.featurizer_type = self.get_required_param("featurizer_type")
    self.featurizer_prefix = self.get_required_param("featurizer_prefix")
    self.detector_type = self.get_required_param("detector_type")

    # Means we extract features from the whole image
    if self.detector_type == "full":
      self.detector = None

    self.verbose = 0
    verbose = self.get_param("verbose")
    if verbose:
      self.verbose = int(verbose)

    self.ingestion_input = "kafka"
    ingestion_input = self.get_param("ingestion_input")
    if ingestion_input:
      self.ingestion_input = ingestion_input

    file_input = self.get_param("file_input")
    print("[{}.ExtractionProcessor: log] file_input: {}".format(self.pp, file_input))
    if file_input:
      self.url_input = False
    print("[{}.ExtractionProcessor: log] url_input: {}".format(self.pp, self.url_input))

    if self.url_input:
      self.img_column = img_URL_column
    else:
      self.img_column = img_path_column
    print("[{}.ExtractionProcessor: log] img_column: {}".format(self.pp, self.img_column))

    # Needs to be built from the extraction type and detection input + "_processed"
    self.extr_family_column = "ext"
    tmp_extr_family_column = self.get_param("extr_family_column")
    if tmp_extr_family_column:
      self.extr_family_column = tmp_extr_family_column

    self.push_back = False
    push_back = self.get_param("push_back")
    if push_back:
      self.push_back = True

    self.extr_prefix = build_extr_str(self.featurizer_type, self.detector_type, self.input_type)
    self.set_pp()

    # Initialize queues
    self.init_queues()

    # Initialize extractors only once (just one at first)
    self.extractors = []
    #for i in range(self.nb_threads):
    #  self.extractors.append(GenericExtractor(self.detector_type, self.featurizer_type, self.input_type,
    #                                  self.extr_family_column, self.featurizer_prefix, self.global_conf))
    self.extractors.append(GenericExtractor(self.detector_type, self.featurizer_type,
                                            self.input_type, self.extr_family_column,
                                            self.featurizer_prefix, self.global_conf))

    # Beware, the self.extr_family_column should be added to the indexer families parameter in get_create_table...
    # What if the table has some other column families?...
    self.tablesha1_col_families = {'info': dict(), self.extr_family_column: dict()}

    # Initialize indexer
    self.indexer = HBaseIndexerMinimal(self.global_conf, prefix=self.get_required_param("indexer_prefix"))
    self.last_update_date_id = ''

    # Initialize ingester
    self.ingester = GenericKafkaProcessor(self.global_conf, prefix=self.get_required_param("proc_ingester_prefix"))
    self.ingester.pp = "ep"


  def set_pp(self):
    self.pp = "ExtractionProcessor"
    if self.extractor:
      self.pp += "_"+self.extr_prefix

  def init_queues(self):
    from multiprocessing import JoinableQueue
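    # NB: JoinableQueue(0) is unbounded; q.join() blocks until every put() item has been
    # matched by a task_done() call, which process_batch emulates below with a manual timeout.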
    self.q_in = []
    self.q_out = []
    for i in range(self.nb_threads):
      self.q_in.append(JoinableQueue(0))
      self.q_out.append(JoinableQueue(0))

  def get_batch_hbase(self):
    # legacy implementation: better to have a kafka topic for batches to be processed to allow
    #       safe parallelization on different machines
    try:
      for updates in self.indexer.get_unprocessed_updates_from_date(self.last_update_date_id,
                                                                    extr_type=self.extr_prefix):
        for update_id, update_cols in updates:
          if self.extr_prefix in update_id:
            # double check update has not been processed somewhere else
            if self.is_update_unprocessed(update_id):
              # double check update was not marked as started recently i.e. by another process
              if self.is_update_notstarted(update_id, max_delay=TIME_ELAPSED_FAILED):
                list_sha1s = update_cols[column_list_sha1s].split(',')
                log_msg = "[{}.get_batch_hbase: log] Update {} has {} images."
                print(log_msg.format(self.pp, update_id, len(list_sha1s)))
                # also get 'ext:' to check if extraction was already processed?
                rows_batch = self.indexer.get_columns_from_sha1_rows(list_sha1s,
                                                                     columns=[img_buffer_column,
                                                                              self.img_column])
                # print "rows_batch", rows_batch
                if rows_batch:
                  yield rows_batch, update_id
                  self.last_update_date_id = '_'.join(update_id.split('_')[-2:])
                else:
                  log_msg = "[{}.get_batch_hbase: log] Did not get any image buffer for update: {}"
                  print(log_msg.format(self.pp, update_id))
              else:
                log_msg = "[{}.get_batch_hbase: log] Skipping update started recently: {}"
                print(log_msg.format(self.pp, update_id))
                continue
            else:
              log_msg = "[{}.get_batch_hbase: log] Skipping already processed update: {}"
              print(log_msg.format(self.pp, update_id))
              continue
          else:
            if self.verbose > 6:
              log_msg = "[{}.get_batch_hbase: log] Skipping update {} from another extraction type."
              print(log_msg.format(self.pp, update_id))
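      # NB: this 'else' belongs to the 'for' loop above (for-else) and runs once the update
      # generator is exhausted; since the loop body never breaks, it always runs after the scan.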
      else:
        print("[{}.get_batch_hbase: log] No unprocessed update found.".format(self.pp))
        # Look for updates that have some unprocessed images
        # TODO: whether we do that or not could be specified by a parameter
        # as this induces a slowdown during updates...
        for updates in self.indexer.get_missing_extr_updates_from_date("1970-01-01",
                                                                       extr_type=self.extr_prefix):
          for update_id, update_cols in updates:
            if self.extr_prefix in update_id:
              if column_list_sha1s in update_cols:
                list_sha1s = update_cols[column_list_sha1s].split(',')
                log_msg = "[{}.get_batch_hbase: log] Update {} has {} images missing extractions."
                print(log_msg.format(self.pp, update_id, len(list_sha1s)))
                sys.stdout.flush()
                # also get 'ext:' to check if extraction was already processed?
                rows_batch = self.indexer.get_columns_from_sha1_rows(list_sha1s,
                                                                     columns=[img_buffer_column,
                                                                              self.img_column])
                if rows_batch:
                  yield rows_batch, update_id
                else:
                  log_msg = "[{}.get_batch_hbase: log] Did not get any image buffer for update: {}"
                  print(log_msg.format(self.pp, update_id))
              else:
                log_msg = "[{}.get_batch_hbase: log] Update {} has no images list."
                print(log_msg.format(self.pp, update_id))
            else:
              log_msg = "[{}.get_batch_hbase: log] Skipping update {} from another extraction type."
              print(log_msg.format(self.pp, update_id))
        else:
          log_msg = "[{}.get_batch_hbase: log] No updates with missing extractions found."
          print(log_msg.format(self.pp))
          sys.stdout.flush()

    except Exception as inst:
      full_trace_error("[{}.get_batch_hbase: error] {}".format(self.pp, inst))


  def is_update_unprocessed(self, update_id):
    update_rows = self.indexer.get_rows_by_batch([update_id],
                                                 table_name=self.indexer.table_updateinfos_name)
    if update_rows:
      for row in update_rows:
        if info_column_family+":"+update_str_processed in row[1]:
          return False
    return True

  def is_update_notstarted(self, update_id, max_delay=None):
    """Check if an update was not started yet.

    :param update_id: update id
    :param max_delay: delay (in seconds) between marked start time and now to consider update failed
    :return: boolean
    """
    update_rows = self.indexer.get_rows_by_batch([update_id],
                                                 table_name=self.indexer.table_updateinfos_name)
    if update_rows:
      for row in update_rows:
        if info_column_family+":"+update_str_started in row[1]:
          if max_delay:
            # check that 'started' was marked recently; if not, it may mean the update processing failed
            start_str = row[1][info_column_family+":"+update_str_started]
            # start time format is '%Y-%m-%d:%H.%M.%S'
            start_dt = datetime.strptime(start_str, '%Y-%m-%d:%H.%M.%S')
            now_dt = datetime.now()
            diff_dt = now_dt - start_dt
            if diff_dt.total_seconds() > max_delay:
              return True
            else:
              return False
          else:
            return False
    return True

  def get_batch_kafka(self):
    # Read from a kafka topic to allow safer parallelization on different machines
    try:
      # Needs to read topic to get update_id and list of sha1s
      for msg in self.ingester.consumer:
        msg_dict = json.loads(msg.value)
        update_id = msg_dict.keys()[0]
        # NB: Try to get update info and check it was really not processed yet.
        if self.is_update_unprocessed(update_id):
          str_list_sha1s = msg_dict[update_id]
          list_sha1s = str_list_sha1s.split(',')
          print("[{}.get_batch_kafka: log] Update {} has {} images.".format(self.pp, update_id, len(list_sha1s)))
          # NB: we could also get 'ext:' of images to double check if extraction was already processed
          #rows_batch = self.indexer.get_columns_from_sha1_rows(list_sha1s, columns=["info:img_buffer"])
          if self.verbose > 3:
            print("[{}.get_batch_kafka: log] Looking for columns: {}".format(self.pp, [img_buffer_column, self.img_column]))
          rows_batch = self.indexer.get_columns_from_sha1_rows(list_sha1s, columns=[img_buffer_column, self.img_column])
          #print "rows_batch", rows_batch
          if rows_batch:
            if self.verbose > 4:
              print("[{}.get_batch_kafka: log] Yielding for update: {}".format(self.pp, update_id))
            yield rows_batch, update_id
            self.ingester.consumer.commit()
            if self.verbose > 4:
              print("[{}.get_batch_kafka: log] After yielding for update: {}".format(self.pp, update_id))
            self.last_update_date_id = '_'.join(update_id.split('_')[-2:])
          # Should we try to commit offset only at this point?
          else:
            print("[{}.get_batch_kafka: log] Did not get any image buffers for the update: {}".format(self.pp, update_id))
        else:
          print("[{}.get_batch_kafka: log] Skipping already processed update: {}".format(self.pp, update_id))
      else:
        print("[{}.get_batch_kafka: log] No update found.".format(self.pp))
        # Fall back to checking HBase for unstarted/unfinished updates
        for rows_batch, update_id in self.get_batch_hbase():
          yield rows_batch, update_id
    except Exception as inst:
      full_trace_error("[{}.get_batch_kafka: error] {}".format(self.pp, inst))


  def get_batch(self):
    if self.ingestion_input == "hbase":
      for rows_batch, update_id in self.get_batch_hbase():
        yield rows_batch, update_id
    else:
      for rows_batch, update_id in self.get_batch_kafka():
        yield rows_batch, update_id

  def process_batch(self):
    # Get a new update batch
    for rows_batch, update_id in self.get_batch():
      try:
        start_update = time.time()
        print("[{}] Processing update {} of {} rows.".format(self.pp, update_id, len(rows_batch)))
        sys.stdout.flush()

        # Initialize
        self.nb_empt = 0
        self.init_queues()
        threads = []

        # If we deleted an extractor at some point or for the first batch
        nb_extr_to_create = self.nb_threads - len(self.extractors)
        if nb_extr_to_create:
          start_create_extractor = time.time()
          while len(self.extractors) < self.nb_threads:

            self.extractors.append(GenericExtractor(self.detector_type, self.featurizer_type, self.input_type,
                                                    self.extr_family_column, self.featurizer_prefix,
                                                    self.global_conf))
          buff_msg = "[{}] Created {} extractors in {}s."
          create_extr_time = time.time() - start_create_extractor
          print(buff_msg.format(self.pp, nb_extr_to_create, create_extr_time))


        # Mark batch as started to be processed
        now_str = datetime.now().strftime('%Y-%m-%d:%H.%M.%S')
        dict_val = {info_column_family + ':' + update_str_started: now_str}
        update_started_dict = {update_id: dict_val}
        self.indexer.push_dict_rows(dict_rows=update_started_dict,
                                    table_name=self.indexer.table_updateinfos_name)

        # Push images to queue
        list_in = []
        # For parallelized downloading...
        from Queue import Queue
        nb_imgs_dl = 0
        q_in_dl = Queue(0)
        q_out_dl = Queue(0)

        start_get_buffer = time.time()
        for img in rows_batch:
          # should decode base64
          if img_buffer_column in img[1]:
            tup = (img[0], img[1][img_buffer_column], False)
            list_in.append(tup)
          else:
            # need to re-download, accumulate a list of URLs to download
            # Deal with img_path_column for local_images_kafka_pusher
            if self.img_column in img[1]:
              q_in_dl.put((img[0], img[1][self.img_column], self.push_back))
              nb_imgs_dl += 1
            else:
              print("[{}: warning] No buffer and no URL/path for image {} !".format(self.pp, img[0]))
              continue

        # Download missing images
        if nb_imgs_dl > 0:
          threads_dl = []
          for i in range(min(self.nb_threads, nb_imgs_dl)):
            # should read (url, obj_pos) from self.q_in
            # and push (url, obj_pos, buffer, img_info, start_process, end_process) to self.q_out
            thread = ThreadedDownloaderBufferOnly(q_in_dl, q_out_dl, url_input=self.url_input)
            thread.start()
            threads_dl.append(thread)

          q_in_dl.join()

          # Push downloaded images to list_in too
          nb_dl = 0
          while nb_dl < nb_imgs_dl:
            # This can block?
            #sha1, buffer, push_back, inst = q_out_dl.get()
            try:
              sha1, buffer, push_back, inst = q_out_dl.get(True, 10)
            except Exception as queue_err:
              warn_msg = "[{}: error] Download queue out timed out: {}"
              print(warn_msg.format(self.pp, queue_err))
              break
            nb_dl += 1
            if inst:
              if self.verbose > 0:
                log_msg = "[{}: log] Could not download image {}, error was: {}"
                print(log_msg.format(self.pp, sha1, inst))
            else:
              if buffer:
                list_in.append((sha1, buffer, push_back))
              else:
                # Is that possible?
                err_msg = "[{}: error] No error but no buffer either for image {}"
                print(err_msg.format(self.pp, sha1))

        get_buffer_time = time.time() - start_get_buffer
        buff_msg = "[{}] Got {}/{} image buffers for update {} in {}s."
        print(buff_msg.format(self.pp, len(list_in), len(rows_batch), update_id, get_buffer_time))
        sys.stdout.flush()

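        # Split the buffered images into chunks of ceil(len/nb_threads) so each image
        # lands in exactly one of the input queues.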
        q_batch_size = int(math.ceil(float(len(list_in))/self.nb_threads))
        for i, q_batch in enumerate(build_batch(list_in, q_batch_size)):
          self.q_in[i].put(q_batch)

        q_in_size = []
        q_in_size_tot = 0
        for i in range(self.nb_threads):
          q_in_size.append(self.q_in[i].qsize())
          q_in_size_tot += q_in_size[i]
        if self.verbose > 5:
          print("[{}] Total input queues sizes is: {}".format(self.pp, q_in_size_tot))

        # Start daemons...
        thread_creation_failed = [0]*self.nb_threads
        for i in range(self.nb_threads):
          # one per non empty input queue
          if q_in_size[i] > 0:
            try:
              thread = DaemonBatchExtractor(self.extractors[i], self.q_in[i], self.q_out[i], verbose=self.verbose)
              # Could get a 'Cannot allocate memory' if we are using too many threads...
              thread.start()
              threads.append(thread)
            except OSError as inst:
              # Should we try to push self.q_in[i] data to some other thread?
              print("[{}.process_batch: error] Could not start thread #{}: {}".format(self.pp, i+1, inst))
              thread_creation_failed[i] = 1
              time.sleep(10*sum(thread_creation_failed))

        if sum(thread_creation_failed) == self.nb_threads:
          raise ValueError("Could not start any thread...")

        nb_threads_running = len(threads)
        start_process = time.time()
        stop = time.time() + self.max_proc_time
        # Wait for all tasks to be marked as done
        threads_finished = [0] * nb_threads_running
        deleted_extr = [0] * nb_threads_running
        thread_msg = "[{}] Thread {}/{} (pid: {}) "
        while sum(threads_finished) < nb_threads_running:
          for i in range(nb_threads_running):
            if sum(threads_finished) == nb_threads_running:
              sys.stdout.flush()
              break
            if threads_finished[i] == 1:
              continue
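            # Map the running-thread index i back to the input queue it was started on by
            # offsetting with the number of earlier threads that failed to start
            # (NB: this offset can still land on a failed slot after consecutive startup failures).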
            i_q_in = i + sum(thread_creation_failed[:i + 1])
            if q_in_size[i_q_in] > 0:
              # This seems to block forever sometimes, if subprocess crashed?...
              #self.q_in[i].join()
              # Manual join with timeout...
              # https://github.com/python/cpython/blob/3.6/Lib/multiprocessing/queues.py
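              # NB: _unfinished_tasks is the JoinableQueue's internal semaphore, released on
              # put() and acquired on task_done(); _is_zero() means every queued batch is done.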
              if not self.q_in[i_q_in]._unfinished_tasks._semlock._is_zero() and time.time() < stop:
                time.sleep(1)
              else:
                if self.q_in[i_q_in]._unfinished_tasks._semlock._is_zero():
                  if self.verbose > 5:
                    msg = thread_msg+"marked as finished because processing seems finished"
                    print(msg.format(self.pp, i+1, nb_threads_running, threads[i].pid))
                else:
                  if self.verbose > 0:
                    # In which cases does this happen?...
                    msg = thread_msg+"force marking task as done as max_proc_time ({}) has passed."
                    print(msg.format(self.pp, i+1, nb_threads_running, threads[i].pid,
                                     self.max_proc_time))
                    sys.stdout.flush()
                    # Try to delete corresponding extractor to free memory?
                    # And reduce number of threads at the end of the loop
                  try:
                    self.q_in[i_q_in].task_done()
                    if deleted_extr[i] == 0:
                      # we pushed the extractor as self.extractors[i] in a loop of self.nb_threads
                      # we use i_q_in
                      del self.extractors[i_q_in]
                      deleted_extr[i] = 1
                  except Exception:
                    pass
                threads_finished[i] = 1
            else:
              if self.verbose > 2:
                # We actually never gave it anything to process...
                msg = thread_msg+"marked as finished because no data was passed to it"
                print(msg.format(self.pp, i+1, nb_threads_running, threads[i].pid))
              threads_finished[i] = 1

        # Cleanup threads to free memory before getting data back
        # Daemon may still be running...
        # and will actually be deleted only when they exit after not getting a batch
        del threads

        # Gather results
        q_out_size = []
        q_out_size_tot = 0
        for i in range(self.nb_threads):
          q_out_size.append(self.q_out[i].qsize())
          q_out_size_tot += q_out_size[i]

        if self.verbose > 5:
          print("[{}: log] Total output queues size is: {}".format(self.pp, q_out_size_tot))
          sys.stdout.flush()

        # Can get stuck here?
        dict_imgs = dict()
        for i in range(self.nb_threads):
          if self.verbose > 4:
            print("[{}] Thread {} q_out_size: {}".format(self.pp, i+1, q_out_size[i]))
            sys.stdout.flush()
          while q_out_size[i]>0 and not self.q_out[i].empty():
            if self.verbose > 6:
              print("[{}] Thread {} q_out is not empty.".format(self.pp, i + 1))
              sys.stdout.flush()
            try:
              batch_out = self.q_out[i].get(True, 10)
              if self.verbose > 4:
                msg = "[{}] Got batch of {} features from thread {} q_out."
                print(msg.format(self.pp, len(batch_out), i + 1))
                sys.stdout.flush()
              for sha1, dict_out in batch_out:
                dict_imgs[sha1] = dict_out
            except Exception as get_err:
              if self.verbose > 1:
                print("[{}] Thread {} failed to get from q_out: {}".format(self.pp, i+1, get_err))
                sys.stdout.flush()
            if self.verbose > 4:
              print("[{}] Marking task done in q_out of thread {}.".format(self.pp, i + 1))
              sys.stdout.flush()
            self.q_out[i].task_done()

        #if self.verbose > 0:
        print_msg = "[{}] Got features for {}/{} images in {}s."
        proc_time = time.time() - start_process
        print(print_msg.format(self.pp, len(dict_imgs.keys()), len(list_in), proc_time))
        sys.stdout.flush()

        # Push them
        self.indexer.push_dict_rows(dict_rows=dict_imgs, table_name=self.indexer.table_sha1infos_name)

        # Mark batch as processed
        update_processed_dict = {update_id: {info_column_family + ':' + update_str_processed: datetime.now().strftime('%Y-%m-%d:%H.%M.%S')}}
        self.indexer.push_dict_rows(dict_rows=update_processed_dict, table_name=self.indexer.table_updateinfos_name)

        # Mark as completed if all rows had an extraction
        if len(rows_batch) == len(dict_imgs.keys()):
          update_completed_dict = {update_id: {info_column_family + ':' + update_str_completed: str(1)}}
          self.indexer.push_dict_rows(dict_rows=update_completed_dict,
                                      table_name=self.indexer.table_updateinfos_name)

        # Cleanup
        del self.q_in
        del self.q_out

        # To try to adjust an overly optimistic nb_threads setting
        # if (sum(thread_creation_failed) > 0 or sum(deleted_extr) > 0) and self.nb_threads > 2:
        #   self.nb_threads -= 1

        print_msg = "[{}] Completed update {} in {}s."
        print(print_msg.format(self.pp, update_id, time.time() - start_update))
        sys.stdout.flush()
        self.nb_err = 0

        # Force garbage collection?
        gc.collect()

        # Should we just raise an Exception and restart cleanly?
        if sum(thread_creation_failed) > 0 or sum(deleted_extr) > 0:
          raise ValueError("Something went wrong. Trying to restart clean")

      except Exception as inst:
        exc_type, exc_obj, exc_tb = sys.exc_info()
        fulltb = traceback.format_tb(exc_tb)
        raise type(inst)(" {} ({})".format(inst, ''.join(fulltb)))

  def run(self):
    self.nb_empt = 0
    self.nb_err = 0
    while True:
      self.process_batch()
      print("[ExtractionProcessor: log] Nothing to process at: {}".format(datetime.now().strftime('%Y-%m-%d:%H.%M.%S')))
      sys.stdout.flush()
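      # Linear back-off: sleep 0s, 10s, 20s, ... after consecutive passes with nothing to process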
      time.sleep(10*self.nb_empt)
      self.nb_empt += 1
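
# Minimal usage sketch (illustrative, not part of the original example): it assumes
# "conf_extraction.json" is a hypothetical JSON configuration file whose keys match the
# get_required_param() calls above, and that ConfReader accepts a plain dict as global_conf.
if __name__ == "__main__":
  import json
  with open("conf_extraction.json", "rt") as conf_file:
    global_conf = json.load(conf_file)
  processor = ExtractionProcessor(global_conf, prefix=default_extr_proc_prefix)
  processor.run()
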
Example #2
class ExtractionProcessor(ConfReader):
    """ExtractionProcessor class"""
    def __init__(self, global_conf, prefix=DEFAULT_EXTR_PROC_PREFIX):
        """ExtractionProcessor constructor

    :param global_conf: configuration file or dictionary
    :type global_conf: str, dict
    :param prefix: prefix in configuration
    :type prefix: str
    """
        self.extractor = None
        self.nb_empt = 0
        self.nb_err = 0
        self.max_proc_time = 900  # in seconds. Increased for sbcmdline...
        self.url_input = True

        super(ExtractionProcessor, self).__init__(global_conf, prefix)

        # TODO: move that to self.read_conf()
        # Get required parameters
        self.input_type = self.get_required_param("input_type")
        self.nb_threads = self.get_required_param("nb_threads")
        self.featurizer_type = self.get_required_param("featurizer_type")
        self.featurizer_prefix = self.get_required_param("featurizer_prefix")
        self.detector_type = self.get_required_param("detector_type")

        # Get optional parameters
        self.verbose = int(self.get_param("verbose", default=0))
        self.maxucme = int(
            self.get_param("max_up_check_miss_extr",
                           default=MAX_UP_CHECK_MISS_EXTR))
        self.ingestion_input = self.get_param("ingestion_input",
                                              default="kafka")
        self.push_back = self.get_param("push_back", default=False)
        file_input = self.get_param("file_input")
        print("[{}.ExtractionProcessor: log] file_input: {}".format(
            self.pp, file_input))
        if file_input:
            self.url_input = False
        print("[{}.ExtractionProcessor: log] url_input: {}".format(
            self.pp, self.url_input))

        # Means we extract features from the whole image
        if self.detector_type == "full":
            self.detector = None

        self.extr_prefix = build_extr_str(self.featurizer_type,
                                          self.detector_type, self.input_type)
        self.set_pp()

        # Initialize queues
        self.init_queues()

        # Initialize indexer
        # We now have two indexers:
        # - one "in_indexer" for TF table with buffer, img URLs etc...
        # - one "out_indexer" for our table with extractions etc
        # NB: they could be the same if tables are merged...
        self.out_indexer = HBaseIndexerMinimal(
            self.global_conf, prefix=self.get_required_param("indexer_prefix"))
        self.out_indexer.pp = "ProcOutHBase"
        prefix_in_indexer = self.get_param("in_indexer_prefix", default=False)
        if prefix_in_indexer:
            self.in_indexer = HBaseIndexerMinimal(self.global_conf,
                                                  prefix=prefix_in_indexer)
            self.in_indexer.pp = "ProcInHBase"
            insha1tab = self.in_indexer.table_sha1infos_name
            insha1cfs = self.in_indexer.get_dictcf_sha1_table()
            print("[{}] 'in_indexer' sha1 table {} columns are: {}".format(
                self.pp, insha1tab, insha1cfs))
        else:
            print(
                "[{}] empty 'in_indexer_prefix', using out_indexer as in_indexer too."
                .format(self.pp))
            self.in_indexer = self.out_indexer
            self.in_indexer.pp = "ProcInOutHBase"

        # Initialize extractors only once (just one at first)
        self.extractors = []
        # DONE: use 'out_indexer'
        self.extractors.append(
            GenericExtractor(self.detector_type, self.featurizer_type,
                             self.input_type, self.out_indexer.extrcf,
                             self.featurizer_prefix, self.global_conf))

        # DONE: use 'in_indexer'
        if self.url_input:
            self.img_column = self.in_indexer.get_col_imgurl()
        else:
            self.img_column = self.in_indexer.get_col_imgpath()
        img_cols = [
            self.in_indexer.get_col_imgbuff(),
            self.in_indexer.get_col_imgurlbak(), self.img_column
        ]
        print("[{}.ExtractionProcessor: log] img_cols: {}".format(
            self.pp, img_cols))

        self.last_update_date_id = "1970-01-01"
        self.last_missing_extr_date = "1970-01-01"

        # Initialize ingester
        self.ingester = GenericKafkaProcessor(
            self.global_conf,
            prefix=self.get_required_param("proc_ingester_prefix"))
        self.ingester.pp = "ep"

    def set_pp(self, pp="ExtractionProcessor"):
        """Set pretty name

    :param pp: pretty name prefix
    :type pp: str
    """
        self.pp = pp
        if self.extractor:
            self.pp += "_" + self.extr_prefix

    def init_queues(self):
        """Initialize the queue lists ``self.q_in`` and ``self.q_out``
    """
        from multiprocessing import JoinableQueue
        self.q_in = []
        self.q_out = []
        for _ in range(self.nb_threads):
            self.q_in.append(JoinableQueue(0))
            self.q_out.append(JoinableQueue(0))

    # Should these two methods be in indexer?
    def is_update_unprocessed(self, update_id):
        """Check if an update was not processed yet

    :param update_id: update id
    :type update_id: str
    :return: boolean indicating if update ``update_id`` is unprocessed
    :rtype: bool
    """
        # DONE: use out_indexer
        update_rows = self.out_indexer.get_rows_by_batch(
            [update_id], table_name=self.out_indexer.table_updateinfos_name)
        if update_rows:
            for row in update_rows:
                if self.out_indexer.get_col_upproc() in row[1]:
                    return False
        return True

    def is_update_notstarted(self, update_id, max_delay=None):
        """Check if an update was not started yet

    :param update_id: update id
    :type update_id: str
    :param max_delay: delay (in seconds) between marked start time and now to consider update failed
    :type max_delay: int
    :return: boolean
    :rtype: bool
    """
        # DONE: use out_indexer
        update_rows = self.out_indexer.get_rows_by_batch(
            [update_id], table_name=self.out_indexer.table_updateinfos_name)
        if update_rows:
            for row in update_rows:
                # changed to: self.column_update_started
                #if info_column_family+":"+update_str_started in row[1]:
                #if self.column_update_started in row[1]:
                # DONE: use out_indexer
                if self.out_indexer.get_col_upstart() in row[1]:
                    if max_delay:
                        start_str = row[1][self.out_indexer.get_col_upstart()]
                        # start time format is '%Y-%m-%d:%H.%M.%S'
                        start_dt = datetime.strptime(start_str,
                                                     '%Y-%m-%d:%H.%M.%S')
                        now_dt = datetime.now()
                        diff_dt = now_dt - start_dt
                        if diff_dt.total_seconds() > max_delay:
                            return True
                    return False
        return True
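        # Illustrative note (not in the original): if max_delay=TIME_ELAPSED_FAILED were, say,
        # 3600 seconds, an update whose start-time cell (out_indexer.get_col_upstart()) reads
        # '2018-01-01:12.00.00' would be treated as not started again once datetime.now() passes
        # 2018-01-01 13:00:00, so a batch left by a crashed worker can eventually be retried.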

    def get_batch_hbase(self):
        """Get one batch of images from HBase

    :yield: tuple (rows_batch, update_id)
    """
        # legacy implementation: better to have a kafka topic for batches to be processed to allow
        # safe and efficient parallelization on different machines
        # DONE: use in_indexer
        img_cols = [
            self.in_indexer.get_col_imgbuff(),
            self.in_indexer.get_col_imgurlbak(), self.img_column
        ]
        try:
            # DONE: use out_indexer
            for updates in self.out_indexer.get_unprocessed_updates_from_date(
                    self.last_update_date_id, extr_type=self.extr_prefix):
                for update_id, update_cols in updates:
                    if self.extr_prefix in update_id:
                        # double check update has not been processed somewhere else
                        if self.is_update_unprocessed(update_id):
                            # double check update was not marked as started recently i.e. by another process
                            if self.is_update_notstarted(
                                    update_id, max_delay=TIME_ELAPSED_FAILED):
                                # DONE: use out_indexer
                                list_sha1s = update_cols[
                                    self.out_indexer.get_col_listsha1s(
                                    )].split(',')
                                msg = "[{}.get_batch_hbase: log] Update {} has {} images."
                                print(
                                    msg.format(self.pp, update_id,
                                               len(list_sha1s)))
                                # We should time that, it seems slow, i.e. 2-3 minutes per update.
                                try:
                                    rows_batch = self.in_indexer.get_columns_from_sha1_rows(
                                        list_sha1s,
                                        rbs=BATCH_SIZE_IMGBUFFER,
                                        columns=img_cols)
                                except Exception:
                                    msg = "[{}.get_batch_hbase: warning] Failed retrieving images data for update: {}"
                                    print(msg.format(self.pp, update_id))
                                    # flush?
                                    sys.stdout.flush()
                                    # Update self.last_update_date_id ?
                                    #self.last_update_date_id = '_'.join(update_id.split('_')[-2:])
                                    continue
                                # print "rows_batch", rows_batch
                                if rows_batch:
                                    yield rows_batch, update_id
                                else:
                                    msg = "[{}.get_batch_hbase: log] Did not get any image buffer for update: {}"
                                    print(msg.format(self.pp, update_id))
                                    #msg = "[{}.get_batch_hbase: log] Was trying to read columns {} from table {} for rows {}"
                                    #print(msg.format(self.pp, img_cols, self.in_indexer.table_sha1infos_name, list_sha1s))
                                # Store last update id
                                self.last_update_date_id = '_'.join(
                                    update_id.split('_')[-2:])
                            else:
                                msg = "[{}.get_batch_hbase: log] Skipping update started recently: {}"
                                print(msg.format(self.pp, update_id))
                                continue
                        else:
                            msg = "[{}.get_batch_hbase: log] Skipping already processed update: {}"
                            print(msg.format(self.pp, update_id))
                            continue
                    else:
                        if self.verbose > 6:
                            msg = "[{}.get_batch_hbase: log] Skipping update {} from another extraction type."
                            print(msg.format(self.pp, update_id))
            else:
                print("[{}.get_batch_hbase: log] No unprocessed update found.".
                      format(self.pp))
                # Should we reinitialize self.last_update_date_id?
                # Look for updates that have some unprocessed images
                # TODO: whether we do that or not could be specified by a parameter
                # as this induces a slowdown during updates...
                # DONE: use out_indexer
                count_ucme = 0
                stop_cme = False
                for updates in self.out_indexer.get_missing_extr_updates_from_date(
                        self.last_missing_extr_date,
                        extr_type=self.extr_prefix):
                    for update_id, update_cols in updates:
                        if self.extr_prefix in update_id:
                            # DONE: use out_indexer
                            if self.out_indexer.get_col_listsha1s(
                            ) in update_cols:
                                list_sha1s = update_cols[
                                    self.out_indexer.get_col_listsha1s(
                                    )].split(',')
                                msg = "[{}.get_batch_hbase: log] Update {} has {} images missing extractions."
                                print(
                                    msg.format(self.pp, update_id,
                                               len(list_sha1s)))
                                sys.stdout.flush()
                                # also get 'ext:' to check if extraction was already processed?
                                # DONE: use in_indexer
                                rows_batch = self.in_indexer.get_columns_from_sha1_rows(
                                    list_sha1s,
                                    rbs=BATCH_SIZE_IMGBUFFER,
                                    columns=img_cols)
                                if rows_batch:
                                    yield rows_batch, update_id
                                    self.last_missing_extr_date = '_'.join(
                                        update_id.split('_')[-2:])
                                    count_ucme += 1
                                    if count_ucme >= self.maxucme:
                                        stop_cme = True
                                        break
                                else:
                                    msg = "[{}.get_batch_hbase: log] Did not get any image buffer for update: {}"
                                    print(msg.format(self.pp, update_id))
                            else:
                                msg = "[{}.get_batch_hbase: log] Update {} has no images list."
                                print(msg.format(self.pp, update_id))
                        else:
                            msg = "[{}.get_batch_hbase: log] Skipping update {} from another extraction type."
                            print(msg.format(self.pp, update_id))
                    # We have reached the maximum number of checks for missing extractions in one call
                    if stop_cme:
                        break
                else:
                    if stop_cme:
                        msg = "[{}.get_batch_hbase: log] Stopped checking updates with missing extractions"
                        msg += " after finding {}/{}."
                        print(msg.format(self.pp, count_ucme, self.maxucme))
                        msg = "[{}.get_batch_hbase: log] Will restart next time from: {}"
                        print(msg.format(self.pp, self.last_missing_extr_date))
                        sys.stdout.flush()
                    else:
                        msg = "[{}.get_batch_hbase: log] No updates with missing extractions found."
                        print(msg.format(self.pp))
                        sys.stdout.flush()
                        # Re-initialize dates just to make sure we don't miss anything
                        self.last_update_date_id = "1970-01-01"
                        self.last_missing_extr_date = "1970-01-01"

        except Exception as inst:
            # If we reach this point it is really a succession of failures
            full_trace_error("[{}.get_batch_hbase: error] {}".format(
                self.pp, inst))
            # Raise Exception to restart process or docker
            raise inst

    def get_batch_kafka(self):
        """Get one batch of images from Kafka

    :yield: tuple (rows_batch, update_id)
    """
        # Read from a kafka topic to allow safer parallelization on different machines
        # DONE: use in_indexer
        img_cols = [
            self.in_indexer.get_col_imgbuff(),
            self.in_indexer.get_col_imgurlbak(), self.img_column
        ]
        try:
            # Needs to read topic to get update_id and list of sha1s
            if self.ingester.consumer:
                for msg in self.ingester.consumer:
                    msg_dict = json.loads(msg.value)
                    update_id = msg_dict.keys()[0]
                    # NB: Try to get update info and check it was really not processed yet.
                    if self.is_update_unprocessed(update_id):
                        str_list_sha1s = msg_dict[update_id]
                        list_sha1s = str_list_sha1s.split(',')
                        msg = "[{}.get_batch_kafka: log] Update {} has {} images."
                        print(msg.format(self.pp, update_id, len(list_sha1s)))
                        if self.verbose > 3:
                            msg = "[{}.get_batch_kafka: log] Looking for columns: {}"
                            print(msg.format(self.pp, img_cols))
                        # DONE: use in_indexer
                        #rows_batch = self.in_indexer.get_columns_from_sha1_rows(list_sha1s, columns=img_cols)
                        rows_batch = self.in_indexer.get_columns_from_sha1_rows(
                            list_sha1s,
                            rbs=BATCH_SIZE_IMGBUFFER,
                            columns=img_cols)
                        #print "rows_batch", rows_batch
                        if rows_batch:
                            if self.verbose > 4:
                                msg = "[{}.get_batch_kafka: log] Yielding for update: {}"
                                print(msg.format(self.pp, update_id))
                            yield rows_batch, update_id
                            self.ingester.consumer.commit()
                            if self.verbose > 4:
                                msg = "[{}.get_batch_kafka: log] After yielding for update: {}"
                                print(msg.format(self.pp, update_id))
                            self.last_update_date_id = '_'.join(
                                update_id.split('_')[-2:])
                        # Should we try to commit offset only at this point?
                        else:
                            msg = "[{}.get_batch_kafka: log] Did not get any image buffers for the update: {}"
                            print(msg.format(self.pp, update_id))
                    else:
                        msg = "[{}.get_batch_kafka: log] Skipping already processed update: {}"
                        print(msg.format(self.pp, update_id))
                else:
                    print("[{}.get_batch_kafka: log] No update found.".format(
                        self.pp))
                    # Fall back to checking HBase for unstarted/unfinished updates
                    for rows_batch, update_id in self.get_batch_hbase():
                        yield rows_batch, update_id
            else:
                print("[{}.get_batch_kafka: log] No consumer found.".format(
                    self.pp))
                # Fall back to checking HBase for unstarted/unfinished updates
                for rows_batch, update_id in self.get_batch_hbase():
                    yield rows_batch, update_id
        except Exception as inst:
            # If we reach this point it is really a succession of failures
            full_trace_error("[{}.get_batch_kafka: error] {}".format(
                self.pp, inst))
            # Raise Exception to restart process or docker
            raise inst

    def get_batch(self):
        """Get one batch of images

    :yield: tuple (rows_batch, update_id)
    """
        if self.ingestion_input == "hbase":
            for rows_batch, update_id in self.get_batch_hbase():
                yield rows_batch, update_id
        else:
            for rows_batch, update_id in self.get_batch_kafka():
                yield rows_batch, update_id

    def process_batch(self):
        """Process one batch of images

    :raises Exception: if something goes really wrong
    """
        # Get a new update batch
        try:
            for rows_batch, update_id in self.get_batch():
                start_update = time.time()
                img_list_size = len(rows_batch)
                print("[{}] Processing update {} of {} rows.".format(
                    self.pp, update_id, img_list_size))
                sys.stdout.flush()

                # Initialize
                self.nb_empt = 0
                self.init_queues()
                threads = []

                # If we have deleted an extractor at some point or for the first batch
                nb_extr_to_create = self.nb_threads - len(self.extractors)
                if nb_extr_to_create:
                    start_create_extractor = time.time()
                    while len(self.extractors) < min(self.nb_threads,
                                                     img_list_size):
                        # DONE: use 'out_indexer'
                        self.extractors.append(
                            GenericExtractor(self.detector_type,
                                             self.featurizer_type,
                                             self.input_type,
                                             self.out_indexer.extrcf,
                                             self.featurizer_prefix,
                                             self.global_conf))
                    msg = "[{}] Created {} extractors in {}s."
                    create_extr_time = time.time() - start_create_extractor
                    print(
                        msg.format(self.pp, len(self.extractors),
                                   create_extr_time))

                # Mark batch as started to be processed
                now_str = datetime.now().strftime('%Y-%m-%d:%H.%M.%S')
                # changed to: self.column_update_started
                #dict_val = {info_column_family + ':' + update_str_started: now_str}
                #dict_val = {self.column_update_started: now_str}
                # DONE: use out_indexer
                dict_val = {self.out_indexer.get_col_upstart(): now_str}
                update_started_dict = {update_id: dict_val}
                # DONE: use out_indexer
                self.out_indexer.push_dict_rows(
                    dict_rows=update_started_dict,
                    table_name=self.out_indexer.table_updateinfos_name)

                # TODO: define a get_buffer_images method
                # --------
                # Push images to queue
                list_in = []
                # For parallelized downloading...
                from Queue import Queue
                nb_imgs_dl = 0
                q_in_dl = Queue(0)
                q_out_dl = Queue(0)

                start_get_buffer = time.time()
                # DONE: use in_indexer in all this scope
                # How could we transfer URL from in table to out table if they are different?...
                for img in rows_batch:
                    # should decode base64
                    #if img_buffer_column in img[1]:
                    if self.in_indexer.get_col_imgbuff() in img[1]:
                        # That's messy...
                        # b64buffer = buffer_to_B64(cStringIO.StringIO(img[1][self.in_indexer.get_col_imgbuff()]))
                        # use img[1].pop(self.in_indexer.get_col_imgbuff())
                        b64buffer = buffer_to_B64(
                            cStringIO.StringIO(img[1].pop(
                                self.in_indexer.get_col_imgbuff())))
                        tup = (img[0], b64buffer, False)
                        list_in.append(tup)
                    else:
                        # need to re-download, accumulate a list of URLs to download
                        if self.verbose > 5:
                            msg = "[{}: log] Will try to download image {} without buffer"
                            print(msg.format(self.pp, img[0]))
                        # Deal with img_path_column for local_images_kafka_pusher
                        if self.img_column in img[1]:
                            q_in_dl.put((img[0], img[1][self.img_column],
                                         self.push_back))
                            nb_imgs_dl += 1
                        elif self.in_indexer.get_col_imgurlbak() in img[1]:
                            q_in_dl.put(
                                (img[0],
                                 img[1][self.in_indexer.get_col_imgurlbak()],
                                 self.push_back))
                            nb_imgs_dl += 1
                        else:
                            msg = "[{}: warning] No buffer and no URL/path for image {} !"
                            print(msg.format(self.pp, img[0]))
                            continue

                # At this point we can delete rows_batch
                del rows_batch
                gc.collect()

                # Download missing images
                nb_dl = 0
                nb_dl_failed = 0
                if nb_imgs_dl > 0:
                    threads_dl = []
                    for i in range(min(self.nb_threads, nb_imgs_dl)):
                        # should read (url, obj_pos) from self.q_in
                        # and push (url, obj_pos, buffer, img_info, start_process, end_process) to self.q_out
                        thread = ThreadedDownloaderBufferOnly(
                            q_in_dl, q_out_dl, url_input=self.url_input)
                        thread.start()
                        threads_dl.append(thread)

                    q_in_dl.join()

                    # Push downloaded images to list_in too
                    while nb_dl < nb_imgs_dl:
                        # This can block?
                        #sha1, buffer, push_back, inst = q_out_dl.get()
                        try:
                            sha1, buffer, push_back, inst = q_out_dl.get(
                                True, 10)
                        except Exception as queue_err:
                            msg = "[{}: error] Download queue out timed out: {}"
                            print(msg.format(self.pp, queue_err))
                            break
                        nb_dl += 1
                        if inst:
                            if self.verbose > 6:
                                msg = "[{}: log] Could not download image {}, error was: {}"
                                print(msg.format(self.pp, sha1, inst))
                            nb_dl_failed += 1
                        else:
                            if buffer:
                                list_in.append((sha1, buffer, push_back))
                            else:
                                # Is that even possible?
                                msg = "[{}: error] No error but no buffer either for image {}"
                                print(msg.format(self.pp, sha1))

                get_buffer_time = time.time() - start_get_buffer
                buff_list_size = len(list_in)
                msg = "[{}] Got {}/{} image buffers ({}/{} downloaded) for update {} in {}s."
                print(
                    msg.format(self.pp, buff_list_size, img_list_size,
                               nb_dl - nb_dl_failed, nb_dl, update_id,
                               get_buffer_time))
                sys.stdout.flush()

                # --------
                # if buff_list_size == 0, we shouldn't try to process anything, just mark update as processed
                if buff_list_size != 0:

                    # TODO: define a get_features method
                    # --------
                    q_batch_size = int(
                        math.ceil(float(buff_list_size) / self.nb_threads))
                    for i, q_batch in enumerate(
                            build_batch(list_in, q_batch_size)):
                        self.q_in[i].put(q_batch)

                    # At this point we can delete list_in
                    del list_in
                    gc.collect()

                    q_in_size = []
                    q_in_size_tot = 0
                    for i in range(self.nb_threads):
                        q_in_size.append(self.q_in[i].qsize())
                        q_in_size_tot += q_in_size[i]
                    if self.verbose > 5:
                        print("[{}] Total input queues sizes is: {}".format(
                            self.pp, q_in_size_tot))

                    # Start daemons...
                    thread_creation_failed = [0] * self.nb_threads
                    for i in range(self.nb_threads):
                        # one per non empty input queue
                        if q_in_size[i] > 0:
                            try:
                                thread = DaemonBatchExtractor(
                                    self.extractors[i],
                                    self.q_in[i],
                                    self.q_out[i],
                                    verbose=self.verbose)
                                # Could get a 'Cannot allocate memory' if we are using too many threads...
                                thread.start()
                                threads.append(thread)
                            except OSError as inst:
                                # Should we try to push self.q_in[i] data to some other thread?
                                msg = "[{}.process_batch: error] Could not start thread #{}: {}"
                                print(msg.format(self.pp, i + 1, inst))
                                thread_creation_failed[i] = 1
                                time.sleep(sum(thread_creation_failed))

                    if sum(thread_creation_failed) == self.nb_threads:
                        # We are in trouble...
                        raise RuntimeError("Could not start any thread...")

                    nb_threads_running = len(threads)
                    deleted_extr = [0] * nb_threads_running
                    start_process = time.time()
                    stop = time.time() + self.max_proc_time
                    # Wait for all tasks to be marked as done
                    threads_finished = [0] * nb_threads_running
                    thread_msg = "[{}] Thread {}/{} (pid: {}) "
                    while sum(threads_finished) < nb_threads_running:
                        for i in range(nb_threads_running):
                            if sum(threads_finished) == nb_threads_running:
                                sys.stdout.flush()
                                break
                            if threads_finished[i] == 1:
                                continue
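                            # Offset i by the number of earlier failed thread slots to recover
                            # the index of the matching input queue (see thread_creation_failed).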
                            i_q_in = i + sum(thread_creation_failed[:i + 1])
                            if q_in_size[i_q_in] > 0:
                                # This seems to block forever sometimes, if subprocess crashed?...
                                #self.q_in[i].join()
                                # Manual join with timeout...
                                # https://github.com/python/cpython/blob/3.6/Lib/multiprocessing/queues.py
                                if not self.q_in[
                                        i_q_in]._unfinished_tasks._semlock._is_zero(
                                        ) and time.time() < stop:
                                    time.sleep(1)
                                else:
                                    if self.q_in[
                                            i_q_in]._unfinished_tasks._semlock._is_zero(
                                            ):
                                        if self.verbose > 5:
                                            msg = thread_msg + "marked as finished because processing seems finished"
                                            print(
                                                msg.format(
                                                    self.pp, i + 1,
                                                    nb_threads_running,
                                                    threads[i].pid))
                                    else:
                                        # Try to stop processing
                                        threads[i].killed = True
                                        if self.verbose > 0:
                                            # In which cases does this happen?...
                                            msg = thread_msg + "killed as max_proc_time ({}) has passed."
                                            print(
                                                msg.format(
                                                    self.pp, i + 1,
                                                    nb_threads_running,
                                                    threads[i].pid,
                                                    self.max_proc_time))
                                            sys.stdout.flush()
                                        # Try to delete corresponding extractor to free memory?
                                        # And reduce number of threads at the end of the loop
                                        try:
                                            # This can block?
                                            #self.q_in[i_q_in].task_done()
                                            if deleted_extr[i] == 0:
                                                # we pushed the extractor as self.extractors[i] in a loop of self.nb_threads
                                                # we use i_q_in
                                                #del self.extractors[i_q_in]
                                                deleted_extr[i] = 1
                                                del self.extractors[i_q_in - sum(deleted_extr[:i + 1])]
                                                #del threads[i - sum(deleted_extr[:i+1])]
                                        except Exception:
                                            pass
                                    threads_finished[i] = 1
                            else:
                                if self.verbose > 2:
                                    # We actually never gave something to process...
                                    msg = thread_msg + "marked as finished because no data was passed to it"
                                    print(
                                        msg.format(self.pp, i + 1,
                                                   nb_threads_running,
                                                   threads[i].pid))
                                threads_finished[i] = 1

                    # Cleanup threads to free memory before getting data back
                    # Daemon may still be running...
                    # and will actually be deleted only when they exit after not getting a batch
                    del threads

                    # Gather results
                    q_out_size = []
                    q_out_size_tot = 0
                    for i in range(self.nb_threads):
                        q_out_size.append(self.q_out[i].qsize())
                        q_out_size_tot += q_out_size[i]

                    if self.verbose > 5:
                        print(
                            "[{}: log] Total output queues size is: {}".format(
                                self.pp, q_out_size_tot))
                        sys.stdout.flush()

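                    # Each item pulled from a q_out is a batch of (sha1, dict_out) pairs produced
                    # by a DaemonBatchExtractor; merge them into a single dict keyed by sha1
                    # before pushing to the indexer.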
                    # Can get stuck here?
                    dict_imgs = dict()
                    if q_out_size_tot > 0:
                        for i in range(self.nb_threads):
                            if self.verbose > 4:
                                print("[{}] Thread {} q_out_size: {}".format(
                                    self.pp, i + 1, q_out_size[i]))
                                sys.stdout.flush()
                            while q_out_size[i] > 0 and not self.q_out[i].empty():
                                if self.verbose > 6:
                                    print("[{}] Thread {} q_out is not empty.".
                                          format(self.pp, i + 1))
                                    sys.stdout.flush()
                                try:
                                    # This can still block forever?
                                    #batch_out = self.q_out[i].get(True, 10)
                                    batch_out = self.q_out[i].get_nowait()
                                    if self.verbose > 4:
                                        msg = "[{}] Got batch of {} features from thread {} q_out."
                                        print(
                                            msg.format(self.pp, len(batch_out),
                                                       i + 1))
                                        sys.stdout.flush()
                                    for sha1, dict_out in batch_out:
                                        dict_imgs[sha1] = dict_out
                                except Exception as inst:
                                    if self.verbose > 1:
                                        msg = "[{}] Thread {} failed to get from q_out: {}"
                                        print(msg.format(self.pp, i + 1, inst))
                                        sys.stdout.flush()
                                if self.verbose > 4:
                                    print(
                                        "[{}] Marking task done in q_out of thread {}."
                                        .format(self.pp, i + 1))
                                    sys.stdout.flush()
                                self.q_out[i].task_done()

                    #if self.verbose > 0:
                    print_msg = "[{}] Got features for {}/{} images in {}s."
                    proc_time = time.time() - start_process
                    print(
                        print_msg.format(self.pp, len(dict_imgs.keys()),
                                         buff_list_size, proc_time))
                    sys.stdout.flush()
                    # --------

                    if len(dict_imgs.keys()) > 0:
                        # Push computed features
                        self.out_indexer.push_dict_rows(
                            dict_rows=dict_imgs,
                            table_name=self.out_indexer.table_sha1infos_name)

                else:
                    msg = "[{}: Warning] Could not get any image buffer (out of {} requested) for update {}"
                    print(msg.format(self.pp, img_list_size, update_id))
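                    # No buffer was retrieved: re-initialize the bookkeeping used below so the
                    # update can still be marked as processed and the thread-count adjustment
                    # at the end of the loop still works.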
                    dict_imgs = dict()
                    nb_threads_running = len(threads)
                    thread_creation_failed = [0] * self.nb_threads
                    deleted_extr = [0] * nb_threads_running

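                # The update is marked as processed as soon as it has been attempted, and as
                # completed only when every requested row yielded an extraction (presumably so
                # partially processed updates can be picked up again later).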
                # Mark batch as processed
                now_str = datetime.now().strftime('%Y-%m-%d:%H.%M.%S')
                # DONE: use out_indexer
                update_processed_dict = {
                    update_id: {
                        self.out_indexer.get_col_upproc(): now_str
                    }
                }
                self.out_indexer.push_dict_rows(
                    dict_rows=update_processed_dict,
                    table_name=self.out_indexer.table_updateinfos_name)

                # Mark as completed if all rows had an extraction
                if img_list_size == len(dict_imgs.keys()):
                    # DONE: use out_indexer
                    update_completed_dict = {
                        update_id: {
                            self.out_indexer.get_col_upcomp(): str(1)
                        }
                    }
                    self.out_indexer.push_dict_rows(
                        dict_rows=update_completed_dict,
                        table_name=self.out_indexer.table_updateinfos_name)

                # Cleanup
                del self.q_in
                del self.q_out

                # To try to adjust a too optimistic nb_threads setting
                # if (sum(thread_creation_failed) > 0 or sum(deleted_extr) > 0) and self.nb_threads > 2:
                #   self.nb_threads -= 1

                msg = "[{}] Completed update {} in {}s."
                print(
                    msg.format(self.pp, update_id,
                               time.time() - start_update))
                sys.stdout.flush()
                self.nb_err = 0

                # Force garbage collection
                gc.collect()

                # Should we just raise an Exception and restart clean?
                if sum(thread_creation_failed) > 0 or sum(deleted_extr) > 0:
                    # To try to adjust a too optimistic nb_threads setting
                    if self.nb_threads > 1:
                        self.nb_threads -= 1
                        self.extractors = []
                        gc.collect()
                    else:
                        raise RuntimeError("Processing failed even with a single thread...")

        except Exception as inst:
            #exc_type, exc_obj, exc_tb = sys.exc_info()
            #fulltb = traceback.format_tb(exc_tb)
            print("[{}.process_batch: ERROR] {}".format(self.pp, inst))
            #print("[{}] {} ({})".format(self.pp, inst, ''.join(fulltb)))
            # Things are likely to be very bad at that point... Docker should be restarted
            #if self.nb_threads == 2:
            raise inst
            #raise type(inst)(" {} ({})".format(inst, ''.join(fulltb)))

    def run(self):
        """Run processor
    """
        self.nb_empt = 0
        self.nb_err = 0
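        # Back off between polls: sleep 10s per empty round recorded in nb_empt,
        # capped at 10 * 60 = 600 seconds.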
        while True:
            self.process_batch()
            msg = "[ExtractionProcessor: log] Nothing to process at: {}"
            print(msg.format(datetime.now().strftime('%Y-%m-%d:%H.%M.%S')))
            sys.stdout.flush()
            time.sleep(10 * min(self.nb_empt, 60))
            self.nb_empt += 1