Example #1
 def setup(self):
     self.session = boto3.Session(profile_name=self.aws_profile)
     self.s3 = self.session.resource('s3')
     # Try to access the bucket first to make sure it exists and is reachable
     try:
         self.s3.meta.client.head_bucket(Bucket=self.bucket_name)
     except botocore.exceptions.ClientError as e:
         err_msg = "[{}: error] Could not check bucket {} using profile {}"
         full_trace_error(
             err_msg.format(self.pp, self.bucket_name, self.aws_profile))
         raise e
     self.bucket = self.s3.Bucket(self.bucket_name)
     if self.verbose > 0:
         print "[{}: log] Initialized with bucket '{}' and profile '{}'.".format(
             self.pp, self.bucket_name, self.aws_profile)
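
# --- Illustrative sketch, not part of the example above ---
# The head_bucket call in setup() is the standard way to verify a bucket is reachable.
# A hedged, standalone version that also inspects the botocore error code to tell a
# missing bucket (404) from a permission problem (403); bucket and profile names are placeholders.
import boto3
import botocore

def check_bucket(bucket_name, profile_name=None):
    session = boto3.Session(profile_name=profile_name)
    s3 = session.resource('s3')
    try:
        s3.meta.client.head_bucket(Bucket=bucket_name)
        return True
    except botocore.exceptions.ClientError as err:
        # 'Error' -> 'Code' is "404" when the bucket does not exist, "403" when access is denied
        print("Bucket '{}' check failed with error code {}".format(
            bucket_name, err.response['Error']['Code']))
        return False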
 def get_batch_kafka(self):
   # Read from a kafka topic to allow safer parallelization on different machines
   try:
     # Needs to read topic to get update_id and list of sha1s
     for msg in self.ingester.consumer:
       msg_dict = json.loads(msg.value)
       update_id = msg_dict.keys()[0]
       # NB: Try to get update info and check it was really not processed yet.
       if self.is_update_unprocessed(update_id):
         str_list_sha1s = msg_dict[update_id]
         list_sha1s = str_list_sha1s.split(',')
         print("[{}.get_batch_kafka: log] Update {} has {} images.".format(self.pp, update_id, len(list_sha1s)))
         # NB: we could also get 'ext:' of images to double check if extraction was already processed
         #rows_batch = self.indexer.get_columns_from_sha1_rows(list_sha1s, columns=["info:img_buffer"])
         if self.verbose > 3:
           print("[{}.get_batch_kafka: log] Looking for colums: {}".format(self.pp, [img_buffer_column, self.img_column]))
         rows_batch = self.indexer.get_columns_from_sha1_rows(list_sha1s, columns=[img_buffer_column, self.img_column])
         #print "rows_batch", rows_batch
         if rows_batch:
           if self.verbose > 4:
             print("[{}.get_batch_kafka: log] Yielding for update: {}".format(self.pp, update_id))
           yield rows_batch, update_id
           self.ingester.consumer.commit()
           if self.verbose > 4:
             print("[{}.get_batch_kafka: log] After yielding for update: {}".format(self.pp, update_id))
           self.last_update_date_id = '_'.join(update_id.split('_')[-2:])
         # Should we try to commit offset only at this point?
         else:
           print("[{}.get_batch_kafka: log] Did not get any image buffers for the update: {}".format(self.pp, update_id))
       else:
         print("[{}.get_batch_kafka: log] Skipping already processed update: {}".format(self.pp, update_id))
     else:
       print("[{}.get_batch_kafka: log] No update found.".format(self.pp))
       # Fall back to checking HBase for unstarted/unfinished updates
       for rows_batch, update_id in self.get_batch_hbase():
         yield rows_batch, update_id
   except Exception as inst:
     full_trace_error("[{}.get_batch_kafka: error] {}".format(self.pp, inst))
  def get_batch_hbase(self):
    # legacy implementation: better to have a kafka topic for batches to be processed to allow
    #       safe parallelization on different machines
    try:
      for updates in self.indexer.get_unprocessed_updates_from_date(self.last_update_date_id,
                                                                    extr_type=self.extr_prefix):
        for update_id, update_cols in updates:
          if self.extr_prefix in update_id:
            # double check update has not been processed somewhere else
            if self.is_update_unprocessed(update_id):
              # double check update was not marked as started recently i.e. by another process
              if self.is_update_notstarted(update_id, max_delay=TIME_ELAPSED_FAILED):
                list_sha1s = update_cols[column_list_sha1s].split(',')
                log_msg = "[{}.get_batch_hbase: log] Update {} has {} images."
                print(log_msg.format(self.pp, update_id, len(list_sha1s)))
                # also get 'ext:' to check if extraction was already processed?
                rows_batch = self.indexer.get_columns_from_sha1_rows(list_sha1s,
                                                                     columns=[img_buffer_column,
                                                                              self.img_column])
                # print "rows_batch", rows_batch
                if rows_batch:
                  yield rows_batch, update_id
                  self.last_update_date_id = '_'.join(update_id.split('_')[-2:])
                else:
                  log_msg = "[{}.get_batch_hbase: log] Did not get any image buffer for update: {}"
                  print(log_msg.format(self.pp, update_id))
              else:
                log_msg = "[{}.get_batch_hbase: log] Skipping update started recently: {}"
                print(log_msg.format(self.pp, update_id))
                continue
            else:
              log_msg = "[{}.get_batch_hbase: log] Skipping already processed update: {}"
              print(log_msg.format(self.pp, update_id))
              continue
          else:
            if self.verbose > 6:
              log_msg = "[{}.get_batch_hbase: log] Skipping update {} from another extraction type."
              print(log_msg.format(self.pp, update_id))
      else:
        print("[{}.get_batch_hbase: log] No unprocessed update found.".format(self.pp))
        # Look for updates that have some unprocessed images
       # TODO: whether we do that or not could be specified by a parameter
       # as this induces a slowdown during updates...
        for updates in self.indexer.get_missing_extr_updates_from_date("1970-01-01",
                                                                       extr_type=self.extr_prefix):
          for update_id, update_cols in updates:
            if self.extr_prefix in update_id:
              if column_list_sha1s in update_cols:
                list_sha1s = update_cols[column_list_sha1s].split(',')
                log_msg = "[{}.get_batch_hbase: log] Update {} has {} images missing extractions."
                print(log_msg.format(self.pp, update_id, len(list_sha1s)))
                sys.stdout.flush()
                # also get 'ext:' to check if extraction was already processed?
                rows_batch = self.indexer.get_columns_from_sha1_rows(list_sha1s,
                                                                     columns=[img_buffer_column,
                                                                              self.img_column])
                if rows_batch:
                  yield rows_batch, update_id
                else:
                  log_msg = "[{}.get_batch_hbase: log] Did not get any image buffer for update: {}"
                  print(log_msg.format(self.pp, update_id))
              else:
                log_msg = "[{}.get_batch_hbase: log] Update {} has no images list."
                print(log_msg.format(self.pp, update_id))
            else:
              log_msg = "[{}.get_batch_hbase: log] Skipping update {} from another extraction type."
              print(log_msg.format(self.pp, update_id))
        else:
          log_msg = "[{}.get_batch_hbase: log] No updates with missing extractions found."
          print(log_msg.format(self.pp))
          sys.stdout.flush()

    except Exception as inst:
      full_trace_error("[{}.get_batch_hbase: error] {}".format(self.pp, inst))
    def init_searcher(self):
        """ Initialize LOPQ model and searcher from `global_conf` value.
    """
        try:
            # Try to load pretrained model from storer
            lopq_model = self.storer.load(self.build_model_str())
            if lopq_model is None:
                raise ValueError("Could not load model from storer.")
            # if self.verbose > 1:
            #   print("pca_mu.shape: {}".format(lopq_model.pca_mu.shape))
            #   print("pca_P.shape: {}".format(lopq_model.pca_P.shape))
        except Exception as inst:
            if type(inst) != ValueError:
                full_trace_error(inst)
            print("[{}: log] Looks like model was not trained yet ({})".format(
                self.pp, inst))

            self.loaded_pretrain_model = False
            # Try to get it from public bucket e.g.:
            # https://s3-us-west-2.amazonaws.com/dig-cu-imagesearchindex/sbpycaffe_feat_full_image_lopq_pca-pca256-subq256-M8-V256_train100000
            if self.get_pretrained_model:
                log_msg = "[{}: log] Trying to retrieve pre-trained model {} from s3"
                print(log_msg.format(self.pp, self.build_model_str()))
                from ..common.dl import download_file
                import pickle
                try:
                    base_model_path = "https://s3-us-west-2.amazonaws.com/dig-cu-imagesearchindex/"
                    # This can fail with a "retrieval incomplete: got only" ...
                    download_file(base_model_path + self.build_model_str(),
                                  self.build_model_str())
                    lopq_model = pickle.load(open(self.build_model_str(),
                                                  'rb'))
                    # Avoid overwriting the model in S3 when the S3Storer uses the dig-cu-imagesearchindex bucket
                    is_s3_storer = isinstance(self.storer, S3Storer)
                    if is_s3_storer and self.storer.bucket_name == "dig-cu-imagesearchindex":
                        log_msg = "[{}: log] Skipping saving model {} back to s3"
                        print(log_msg.format(self.pp, self.build_model_str()))
                    else:
                        log_msg = "[{}: log] Saving model {} to storer"
                        print(log_msg.format(self.pp, self.build_model_str()))
                        self.storer.save(self.build_model_str(), lopq_model)
                    log_msg = "[{}: log] Loaded pretrained model {} from s3"
                    print(log_msg.format(self.pp, self.build_model_str()))
                    self.loaded_pretrain_model = True
                except Exception as inst:
                    log_msg = "[{}: log] Could not loaded pretrained model {} from s3: {}"
                    #print(log_msg.format(self.pp, self.build_model_str(), inst))
                    full_trace_error(
                        log_msg.format(self.pp, self.build_model_str(), inst))
                    sys.stdout.flush()
            else:
                log_msg = "[{}: log] Skipped retrieving pre-trained model from s3 as requested."
                print(log_msg.format(self.pp, self.build_model_str()))

            if not self.loaded_pretrain_model:
                # This is from our modified LOPQ package...
                # https://github.com/ColumbiaDVMM/ColumbiaImageSearch/tree/master/workflows/build-lopq-index/lopq/python
                # 'LOPQModelPCA' could be the type of the model loaded from pickle file
                # from lopq.model import LOPQModel, LOPQModelPCA
                # Size of DB should depend on nb_train... How should we properly set the size of this?
                # It should be nb_train_pca * size_feat + nb_train * size_feat_pca
                feat_size = get_feat_size(self.featurizer_type)
                if self.model_type == "lopq_pca":
                    map_size = self.nb_train_pca * feat_size * 4 * 8
                    map_size += self.nb_train * self.model_params['pca'] * 4 * 8
                else:
                    map_size = self.nb_train * feat_size * 4 * 8
                self.save_feat_env = lmdb.open('/data/lmdb_feats_' +
                                               self.build_model_str(),
                                               map_size=int(1.1 * map_size),
                                               writemap=True,
                                               map_async=True,
                                               max_dbs=2)

                # Train and save model in save_path folder
                lopq_model = self.train_index()
                # TODO: we could build a more unique model identifier
                # (using domain information? sha1/md5 of model parameters? using date of training?)
                # that would also mean we should list from the storer and guess
                # (based on date of creation) the correct model above...
                self.storer.save(self.build_model_str(), lopq_model)

        # Setup searcher with LOPQ model
        if lopq_model:
            # LOPQSearcherLMDB is now the default, as it makes the index more persistent
            # and potentially more easily usable with multiple processes.
            if self.lopq_searcher == "LOPQSearcherLMDB":
                from lopq.search import LOPQSearcherLMDB
                # TODO: should we get path from a parameter? and/or add model_str to it?
                # self.searcher = LOPQSearcherLMDB(lopq_model, lmdb_path='./lmdb_index/', id_lambda=str)
                # self.updates_env = lmdb.open('./lmdb_updates/', map_size=1024 * 1000000 * 1, writemap=True, map_async=True, max_dbs=1)
                self.searcher = LOPQSearcherLMDB(
                    lopq_model,
                    lmdb_path='/data/lmdb_index_' + self.build_model_str(),
                    id_lambda=str)
                # How could we properly set the size of this?
                self.updates_env = lmdb.open('/data/lmdb_updates_' +
                                             self.build_model_str(),
                                             map_size=1024 * 1000000 * 1,
                                             writemap=True,
                                             map_async=True,
                                             max_dbs=1)
                self.updates_index_db = self.updates_env.open_db("updates")
            elif self.lopq_searcher == "LOPQSearcher":
                from lopq.search import LOPQSearcher
                self.searcher = LOPQSearcher(lopq_model)
            else:
                raise ValueError("Unknown 'lopq_searcher' type: {}".format(
                    self.lopq_searcher))
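
# --- Illustrative sketch, not part of the example above ---
# The map_size computation in init_searcher reserves roughly the space the training
# features will need, with some headroom, before opening the LMDB environment with
# named sub-databases. A minimal standalone version; nb_train, feat_size and the path
# are illustrative values, not the project's defaults.
import lmdb
import numpy as np

nb_train = 100000
feat_size = 256
map_size = int(1.1 * nb_train * feat_size * np.dtype(np.float32).itemsize)

env = lmdb.open('/tmp/lmdb_feats_example', map_size=map_size,
                writemap=True, map_async=True, max_dbs=2)
feats_db = env.open_db(b'feats')

with env.begin(write=True, db=feats_db) as txn:
    # store one float32 feature vector under its sha1 key
    txn.put(b'0000sha1example', np.zeros(feat_size, dtype=np.float32).tobytes())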
if __name__ == "__main__":

  # Get conf file
  parser = ArgumentParser()
  parser.add_argument("-c", "--conf", dest="conf_file", required=True)
  parser.add_argument("-p", "--prefix", dest="prefix", default=default_extr_proc_prefix)
  options = parser.parse_args()

  # TODO: should we daemonize that too?

  # Initialize extraction processor
  ep = ExtractionProcessor(options.conf_file, prefix=options.prefix)
  nb_err = 0

  print("Extraction processor options are: {}".format(options))
  sys.stdout.flush()

  while True:
    try:
      ep.run()
      nb_err = 0
    except Exception as inst:
      full_trace_error("Extraction processor failed: {}".format(inst))
      sys.stdout.flush()
      break
      #raise inst
      # del ep
      # gc.collect()
      # time.sleep(10*nb_err)
      # ep = ExtractionProcessor(options.conf_file, prefix=options.prefix)
      # nb_err += 1
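
  # --- Illustrative sketch, not part of the example above ---
  # The commented-out lines above hint at a retry-with-backoff variant of this loop:
  # instead of breaking on the first failure, recreate the processor and wait longer
  # after each consecutive failure. A hedged version (the cap of 5 retries is an assumption):
  nb_err = 0
  while True:
    try:
      ep.run()
      nb_err = 0
    except Exception as inst:
      full_trace_error("Extraction processor failed: {}".format(inst))
      sys.stdout.flush()
      nb_err += 1
      if nb_err > 5:
        break
      time.sleep(10 * nb_err)
      ep = ExtractionProcessor(options.conf_file, prefix=options.prefix)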
    def get_train_features(self,
                           nb_features,
                           lopq_pca_model=None,
                           nb_min_train=None):
        if nb_min_train is None:
            nb_min_train = nb_features
        if lopq_pca_model:
            feats_db = self.save_feat_env.open_db("feats_pca")
            dtype = np.float32
        else:
            feats_db = self.save_feat_env.open_db("feats")
            from ..featurizer.featsio import get_feat_dtype
            dtype = get_feat_dtype(self.featurizer_type)
        nb_saved_feats = self.get_nb_saved_feats(feats_db)
        nb_features_to_read = nb_features

        seen_updates = set()

        if nb_saved_feats < nb_features:
            print("[{}: log] Gathering {} training samples...".format(
                self.pp, nb_features))
            sys.stdout.flush()
            start_date = "1970-01-01"
            done = False
            # Accumulate until we have enough features, or we have read all features if 'wait_for_nbtrain'
            # is false and we have at least nb_min_train
            while not done:
                for batch_updates in self.indexer.get_updates_from_date(
                        start_date=start_date,
                        extr_type=self.build_extr_str()):
                    for update in batch_updates:
                        # for update in updates:
                        try:
                            # We could check if update has been processed, but if not we won't get features anyway
                            update_id = update[0]
                            if column_list_sha1s in update[1]:
                                if update_id not in seen_updates:
                                    list_sha1s = update[1][column_list_sha1s]
                                    samples_ids, features = self.indexer.get_features_from_sha1s(
                                        list_sha1s.split(','),
                                        self.build_extr_str())
                                    if features:
                                        # Apply PCA to features here to save memory
                                        if lopq_pca_model:
                                            np_features = lopq_pca_model.apply_PCA(
                                                np.asarray(features))
                                        else:
                                            np_features = np.asarray(features)
                                        log_msg = "[{}: log] Got features {} from update {}"
                                        print(
                                            log_msg.format(
                                                self.pp, np_features.shape,
                                                update_id))
                                        sys.stdout.flush()
                                        # just appending like this does not account for duplicates...
                                        # train_features.extend(np_features)
                                        nb_saved_feats = self.save_feats_to_lmbd(
                                            feats_db, samples_ids, np_features)
                                        seen_updates.add(update_id)
                                    else:
                                        if self.verbose > 3:
                                            log_msg = "[{}: log] Did not get features from update {}"
                                            print(
                                                log_msg.format(
                                                    self.pp, update_id))
                                            sys.stdout.flush()
                                    if nb_saved_feats >= nb_features:
                                        done = True
                                        break
                            else:
                                warn_msg = "[{}: warning] Update {} has no list of images associated to it."
                                print(warn_msg.format(self.pp, update_id))
                                sys.stdout.flush()
                        except Exception as inst:
                            from cufacesearch.common.error import full_trace_error
                            err_msg = "[{}: error] Failed to get features: {} {}"
                            full_trace_error(
                                err_msg.format(self.pp, type(inst), inst))
                            sys.stdout.flush()
                        else:
                            if self.verbose > 4:
                                print(
                                    "[{}: log] Got {} training samples so far..."
                                    .format(self.pp, nb_saved_feats))
                                sys.stdout.flush()
                        if done:
                            nb_features_to_read = nb_saved_feats
                            break
                else:
                    if not done:
                        # Wait for new updates...
                        # TODO: could be optional
                        if self.wait_for_nbtrain:
                            if nb_saved_feats >= nb_min_train:
                                log_msg = "[{}: log] Gathered minimum number of training features ({})..."
                                print(log_msg.format(self.pp, nb_min_train))
                                sys.stdout.flush()
                                break
                            else:
                                log_msg = "[{}: log] Waiting for new updates. Got {} training samples so far..."
                                print(log_msg.format(self.pp, nb_saved_feats))
                                sys.stdout.flush()
                                time.sleep(60)
                        else:
                            log_msg = "[{}: log] Gathered all available features ({})..."
                            print(
                                log_msg.format(
                                    self.pp,
                                    self.get_nb_saved_feats(feats_db)))
                            sys.stdout.flush()
                            break

        return self.get_feats_from_lmbd(feats_db, nb_features_to_read, dtype)
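
# --- Illustrative sketch, not part of the example above ---
# save_feats_to_lmbd / get_feats_from_lmbd (not shown) persist feature vectors in the LMDB
# environment opened earlier. A minimal round-trip under the assumption that one row is
# stored per sample id, serialized with numpy; the function and key names are placeholders.
import lmdb
import numpy as np

def save_feats(env, feats_db, sample_ids, features):
    # store one float32 feature vector per sample id
    with env.begin(write=True, db=feats_db) as txn:
        for sid, feat in zip(sample_ids, features):
            txn.put(sid.encode('utf-8'),
                    np.ascontiguousarray(feat, dtype=np.float32).tobytes())

def read_feats(env, feats_db, nb_features, dtype=np.float32):
    feats = []
    with env.begin(db=feats_db) as txn:
        for _, value in txn.cursor():
            feats.append(np.frombuffer(value, dtype=dtype))
            if len(feats) >= nb_features:
                break
    return np.vstack(feats)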
Example #7
  def load_codes(self, full_refresh=False):
    """Load codes

    :param full_refresh: whether to perform a full refresh or not
    :type full_refresh: bool
    """
    # For multi-workers setting with gunicorn
    self.set_pp(pp="SearcherLOPQHBase." + str(os.getpid()))

    # Calling this method can also perform an update of the index
    if not self.searcher:
      info_msg = "[{}.load_codes: info] Not loading codes as searcher is not initialized."
      print(info_msg.format(self.pp))
      return

    start_load = time.time()
    total_compute_time = 0

    try:
      # try to get date of last update
      start_date = "1970-01-01"
      if not full_refresh:
        start_date = self.get_latest_update_suffix()
      extr_str = self.build_extr_str()
      feat_size = get_feat_size(self.featurizer_type)
      #feat_type = get_feat_dtype(self.featurizer_type)
      feat_type = self.featurizer_type

      # Get all updates ids for the extraction type
      # TODO: this scan makes the API unresponsive for ~2 minutes during the update process...
      for batch_updates in self.indexer.get_updates_from_date(start_date=start_date,
                                                              extr_type=extr_str):
        for update in batch_updates:
          update_id = update[0]
          # mdb_txn_begin: MDB_BAD_RSLOT: Invalid reuse of reader locktable slot?
          if self.is_update_indexed(update_id) and not full_refresh:
            if self.verbose > 4:
              print("[{}: log] Skipping update {} already indexed.".format(self.pp, update_id))
            continue
          else:
            dtn = datetime.now()
            if self.is_update_processed(update[1]) and not self.skip_update(update_id, dtn):
              print("[{}: log] Looking for codes of update {}".format(self.pp, update_id))
              # Get this update codes
              codes_string = self.build_codes_string(update_id)
              try:
                # Check for precomputed codes
                codes_dict = self.storer.load(codes_string, silent=True)
                if codes_dict is None:
                  msg = "[{}: log] Could not load codes from {}"
                  raise ValueError(msg.format(self.pp, codes_string))
                # If full_refresh, check that we have as many codes as available features
                if full_refresh:
                  # Also check for 'completed' flag?
                  if self.indexer.get_col_listsha1s() in update[1]:
                    set_sha1s = set(update[1][self.indexer.get_col_listsha1s()].split(','))
                    sids, _ = self.indexer.get_features_from_sha1s(list(set_sha1s), extr_str)
                    if len(set(sids)) > len(codes_dict):
                      msg = "[{}: log] Update {} has {} new features"
                      diff_count = len(set(sids)) - len(codes_dict)
                      raise ValueError(msg.format(self.pp, update_id, diff_count))
                    else:
                      msg = "[{}: log] Skipping update {} indexed with all {}/{} features"
                      print(msg.format(self.pp, update_id, len(codes_dict), len(set(sids))))
                      miss_extr = self.indexer.get_missing_extr_sha1s(list(set_sha1s), extr_str,
                                                                      skip_failed=self.skipfailed)
                      # If all sha1s have been processed, no need to ever check that update again
                      # Store that information as future date_db to avoid ever checking again...
                      if not miss_extr and self.lopq_searcher == "LOPQSearcherLMDB":
                        dtn = dtn.replace(year=9999)
              except Exception as inst:
                # Update codes not available
                if self.verbose > 3:
                  print(inst)
                # Compute codes for update not yet processed and save them
                start_compute = time.time()
                # Get detections (if any) and features
                if self.indexer.get_col_listsha1s() in update[1]:
                  list_sha1s = list(set(update[1][self.indexer.get_col_listsha1s()].split(',')))
                  sids, fts = self.indexer.get_features_from_sha1s(list_sha1s, extr_str, feat_type)
                  if fts:
                    if fts[0].shape[-1] != feat_size:
                      msg = "[{}.load_codes: error] Invalid feature size {} vs {} expected"
                      raise ValueError(msg.format(fts[0].shape[-1], feat_size))
                    codes_dict = self.compute_codes(sids, fts, codes_string)
                    update_compute_time = time.time() - start_compute
                    total_compute_time += update_compute_time
                    if self.verbose > 0:
                      log_msg = "[{}: log] Update {} codes computation done in {}s"
                      print(log_msg.format(self.pp, update_id, update_compute_time))
                  else:
                    print("[{}: warning] Update {} has no features.".format(self.pp, update_id))
                    continue
                else:
                  print("[{}: warning] Update {} has no list of images.".format(self.pp, update_id))
                  continue

              # Use new method add_codes_from_dict of searcher
              self.searcher.add_codes_from_dict(codes_dict)
              self.add_update(update_id, date_db=dtn)

      total_load = time.time() - start_load
      self.last_refresh = datetime.now()

      print("[{}: log] Total udpates computation time is: {}s".format(self.pp, total_compute_time))
      print("[{}: log] Total udpates loading time is: {}s".format(self.pp, total_load))
      # Total udpates loading time is: 0.0346581935883s, really? Seems much longer

    except Exception as inst:
      full_trace_error("[{}: error] Could not load codes. {}".format(self.pp, inst))
    def get_batch_kafka(self):
        """Get one batch of images from Kafka

        :yield: tuple (rows_batch, update_id)
        """
        # Read from a kafka topic to allow safer parallelization on different machines
        # DONE: use in_indexer
        img_cols = [
            self.in_indexer.get_col_imgbuff(),
            self.in_indexer.get_col_imgurlbak(), self.img_column
        ]
        try:
            # Needs to read topic to get update_id and list of sha1s
            if self.ingester.consumer:
                for msg in self.ingester.consumer:
                    msg_dict = json.loads(msg.value)
                    update_id = msg_dict.keys()[0]
                    # NB: Try to get update info and check it was really not processed yet.
                    if self.is_update_unprocessed(update_id):
                        str_list_sha1s = msg_dict[update_id]
                        list_sha1s = str_list_sha1s.split(',')
                        msg = "[{}.get_batch_kafka: log] Update {} has {} images."
                        print(msg.format(self.pp, update_id, len(list_sha1s)))
                        if self.verbose > 3:
                            msg = "[{}.get_batch_kafka: log] Looking for columns: {}"
                            print(msg.format(self.pp, img_cols))
                        # DONE: use in_indexer
                        rows_batch = self.in_indexer.get_columns_from_sha1_rows(
                            list_sha1s, columns=img_cols)
                        #print "rows_batch", rows_batch
                        if rows_batch:
                            if self.verbose > 4:
                                msg = "[{}.get_batch_kafka: log] Yielding for update: {}"
                                print(msg.format(self.pp, update_id))
                            yield rows_batch, update_id
                            self.ingester.consumer.commit()
                            if self.verbose > 4:
                                msg = "[{}.get_batch_kafka: log] After yielding for update: {}"
                                print(msg.format(self.pp, update_id))
                            self.last_update_date_id = '_'.join(
                                update_id.split('_')[-2:])
                        # Should we try to commit offset only at this point?
                        else:
                            msg = "[{}.get_batch_kafka: log] Did not get any image buffers for the update: {}"
                            print(msg.format(self.pp, update_id))
                    else:
                        msg = "[{}.get_batch_kafka: log] Skipping already processed update: {}"
                        print(msg.format(self.pp, update_id))
                else:
                    print("[{}.get_batch_kafka: log] No update found.".format(
                        self.pp))
                    # Fall back to checking HBase for unstarted/unfinished updates
                    for rows_batch, update_id in self.get_batch_hbase():
                        yield rows_batch, update_id
            else:
                print("[{}.get_batch_kafka: log] No consumer found.".format(
                    self.pp))
                # Fall back to checking HBase for unstarted/unfinished updates
                for rows_batch, update_id in self.get_batch_hbase():
                    yield rows_batch, update_id
        except Exception as inst:
            full_trace_error("[{}.get_batch_kafka: error] {}".format(
                self.pp, inst))
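
# --- Illustrative sketch, not part of the example above ---
# msg_dict.keys()[0] in the examples only works on Python 2 (dict.keys() returns a view on
# Python 3). A version of the single-key message parsing that works on both; the message
# content is illustrative.
import json

def parse_update_message(raw_value):
    msg_dict = json.loads(raw_value)
    update_id = next(iter(msg_dict))            # {update_id: "sha1,sha1,..."}
    list_sha1s = msg_dict[update_id].split(',')
    return update_id, list_sha1s

update_id, list_sha1s = parse_update_message('{"index_update_2018-01-01_00": "sha1a,sha1b"}')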
Example #9
    def get_batch_hbase(self):
        """Get one batch of images from HBase

        :yield: tuple (rows_batch, update_id)
        """
        # legacy implementation: better to have a kafka topic for batches to be processed to allow
        # safe and efficient parallelization on different machines
        # DONE: use in_indexer
        img_cols = [
            self.in_indexer.get_col_imgbuff(),
            self.in_indexer.get_col_imgurlbak(), self.img_column
        ]
        try:
            # DONE: use out_indexer
            for updates in self.out_indexer.get_unprocessed_updates_from_date(
                    self.last_update_date_id, extr_type=self.extr_prefix):
                for update_id, update_cols in updates:
                    if self.extr_prefix in update_id:
                        # double check update has not been processed somewhere else
                        if self.is_update_unprocessed(update_id):
                            # double check update was not marked as started recently i.e. by another process
                            if self.is_update_notstarted(
                                    update_id, max_delay=TIME_ELAPSED_FAILED):
                                # DONE: use out_indexer
                                list_sha1s = update_cols[
                                    self.out_indexer.get_col_listsha1s(
                                    )].split(',')
                                msg = "[{}.get_batch_hbase: log] Update {} has {} images."
                                print(
                                    msg.format(self.pp, update_id,
                                               len(list_sha1s)))
                                # We should time this; it seems slow, i.e. 2-3 minutes per update.
                                try:
                                    rows_batch = self.in_indexer.get_columns_from_sha1_rows(
                                        list_sha1s,
                                        rbs=BATCH_SIZE_IMGBUFFER,
                                        columns=img_cols)
                                except Exception:
                                    msg = "[{}.get_batch_hbase: warning] Failed retrieving images data for update: {}"
                                    print(msg.format(self.pp, update_id))
                                    # flush?
                                    sys.stdout.flush()
                                    # Update self.last_update_date_id ?
                                    #self.last_update_date_id = '_'.join(update_id.split('_')[-2:])
                                    continue
                                # print "rows_batch", rows_batch
                                if rows_batch:
                                    yield rows_batch, update_id
                                else:
                                    msg = "[{}.get_batch_hbase: log] Did not get any image buffer for update: {}"
                                    print(msg.format(self.pp, update_id))
                                    #msg = "[{}.get_batch_hbase: log] Was trying to read columns {} from table {} for rows {}"
                                    #print(msg.format(self.pp, img_cols, self.in_indexer.table_sha1infos_name, list_sha1s))
                                # Store last update id
                                self.last_update_date_id = '_'.join(
                                    update_id.split('_')[-2:])
                            else:
                                msg = "[{}.get_batch_hbase: log] Skipping update started recently: {}"
                                print(msg.format(self.pp, update_id))
                                continue
                        else:
                            msg = "[{}.get_batch_hbase: log] Skipping already processed update: {}"
                            print(msg.format(self.pp, update_id))
                            continue
                    else:
                        if self.verbose > 6:
                            msg = "[{}.get_batch_hbase: log] Skipping update {} from another extraction type."
                            print(msg.format(self.pp, update_id))
            else:
                print("[{}.get_batch_hbase: log] No unprocessed update found.".
                      format(self.pp))
                # Should we reinitialize self.last_update_date_id?
                # Look for updates that have some unprocessed images
                # TODO: whether we do that or not could be specified by a parameter
                # as this induces a slowdown during updates...
                # DONE: use out_indexer
                count_ucme = 0
                stop_cme = False
                for updates in self.out_indexer.get_missing_extr_updates_from_date(
                        self.last_missing_extr_date,
                        extr_type=self.extr_prefix):
                    for update_id, update_cols in updates:
                        if self.extr_prefix in update_id:
                            # DONE: use out_indexer
                            if self.out_indexer.get_col_listsha1s(
                            ) in update_cols:
                                list_sha1s = update_cols[
                                    self.out_indexer.get_col_listsha1s(
                                    )].split(',')
                                msg = "[{}.get_batch_hbase: log] Update {} has {} images missing extractions."
                                print(
                                    msg.format(self.pp, update_id,
                                               len(list_sha1s)))
                                sys.stdout.flush()
                                # also get 'ext:' to check if extraction was already processed?
                                # DONE: use in_indexer
                                rows_batch = self.in_indexer.get_columns_from_sha1_rows(
                                    list_sha1s,
                                    rbs=BATCH_SIZE_IMGBUFFER,
                                    columns=img_cols)
                                if rows_batch:
                                    yield rows_batch, update_id
                                    self.last_missing_extr_date = '_'.join(
                                        update_id.split('_')[-2:])
                                    count_ucme += 1
                                    if count_ucme >= self.maxucme:
                                        stop_cme = True
                                        break
                                else:
                                    msg = "[{}.get_batch_hbase: log] Did not get any image buffer for update: {}"
                                    print(msg.format(self.pp, update_id))
                            else:
                                msg = "[{}.get_batch_hbase: log] Update {} has no images list."
                                print(msg.format(self.pp, update_id))
                        else:
                            msg = "[{}.get_batch_hbase: log] Skipping update {} from another extraction type."
                            print(msg.format(self.pp, update_id))
                    # We have reached maximum number of check for missing extractions in one call
                    if stop_cme:
                        break
                else:
                    if stop_cme:
                        msg = "[{}.get_batch_hbase: log] Stopped checking updates with missing extractions"
                        msg += "after founding {}/{}."
                        print(
                            msg.format(self.pp, count_ucme, self.maxucme,
                                       self.last_missing_extr_date))
                        msg = "[{}.get_batch_hbase: log] Will restart next time from: {}"
                        print(msg.format(self.pp, self.last_missing_extr_date))
                        sys.stdout.flush()
                    else:
                        msg = "[{}.get_batch_hbase: log] No updates with missing extractions found."
                        print(msg.format(self.pp))
                        sys.stdout.flush()
                        # Re-initialize dates just to make sure we don't miss anything
                        self.last_update_date_id = "1970-01-01"
                        self.last_missing_extr_date = "1970-01-01"

        except Exception as inst:
            # If we reach this point it is really a succession of failures
            full_trace_error("[{}.get_batch_hbase: error] {}".format(
                self.pp, inst))
            # Raise Exception to restart process or docker
            raise inst
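
# --- Illustrative sketch, not part of the example above ---
# The rbs=BATCH_SIZE_IMGBUFFER parameter above suggests image buffers are fetched from
# HBase in fixed-size chunks of row keys, so a single request never pulls thousands of
# large cells at once. A hedged standalone version using happybase directly (the actual
# indexer may wrap HBase differently; host, table and column names are placeholders):
import happybase

def get_rows_in_chunks(table_name, row_keys, columns, chunk_size=100, host='localhost'):
    connection = happybase.Connection(host)
    table = connection.table(table_name)
    rows = []
    for i in range(0, len(row_keys), chunk_size):
        rows.extend(table.rows(row_keys[i:i + chunk_size], columns=columns))
    connection.close()
    return rows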
    parser.add_argument("-p", "--port", dest="port", type=int, default=5000)
    parser.add_argument("-e", "--endpoint", dest="endpoint", type=str, default="cuimgsearch")
    options = parser.parse_args()

    # Load config file
    print("[API: log] Setting conf file to: {}".format(options.conf_file))
    global_conf = json.load(open(options.conf_file, 'rt'))

    # Initialize searcher object only once
    while True:
        try:
            api.global_searcher = searcher_lopqhbase.SearcherLOPQHBase(global_conf)
            break
        except Exception as inst:
            err_msg = "Failed to initialized searcher ({}): {}".format(type(inst), inst)
            from cufacesearch.common.error import full_trace_error
            full_trace_error(err_msg)
            time.sleep(60)
    api.global_start_time = datetime.now()
    api.input_type = api.global_searcher.input_type

    # Start API
    searchapi.add_resource(api.APIResponder, '/'+options.endpoint+'/<string:mode>')
    print("[API: log] Starting Search API on port {} with endpoint '{}'".format(options.port, options.endpoint))
    sys.stdout.flush()
    from gevent.wsgi import WSGIServer
    http_server = WSGIServer(('', options.port), app)
    http_server.serve_forever()
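
# --- Illustrative sketch, not part of the example above ---
# A minimal, self-contained version of the serving setup above. Recent gevent releases
# expose the server as gevent.pywsgi (gevent.wsgi is the legacy module name used in the
# example); the endpoint, port and response here are illustrative.
from flask import Flask
from gevent.pywsgi import WSGIServer

app = Flask(__name__)

@app.route('/cuimgsearch/<string:mode>')
def responder(mode):
    return "mode: {}".format(mode)

if __name__ == "__main__":
    http_server = WSGIServer(('', 5000), app)
    print("Starting Search API on port 5000")
    http_server.serve_forever()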