class ExtractionChecker(ConfReader):
  """ExtractionChecker class
  """

  def __init__(self, global_conf, prefix=DEFAULT_EXTR_CHECK_PREFIX, pid=None):
    """ExtractionChecker constructor

    :param global_conf: configuration file or dictionary
    :type global_conf: str, dict
    :param prefix: prefix in configuration
    :type prefix: str
    :param pid: process id
    :type pid: int
    """
    self.list_extr_prefix = []
    self.pid = pid
    self.dict_sha1_infos = dict()

    super(ExtractionChecker, self).__init__(global_conf, prefix)

    self.last_push = time.time()
    self.nb_imgs_check = 0
    self.nb_imgs_unproc = 0
    self.nb_imgs_unproc_lastprint = 0

    self.featurizer_type = self.get_required_param("featurizer_type")
    self.detector_type = self.get_required_param("detector_type")
    self.input_type = self.get_required_param("input_type")

    # Max delay
    self.max_delay = int(self.get_param("max_delay", default=3600))

    self.list_extr_prefix = [self.featurizer_type, "feat", self.detector_type, self.input_type]
    self.extr_prefix = "_".join(self.list_extr_prefix)
    self.batch_check_column = None
    self.check_columns = []

    # changed to: get column family from indexer in set_check_columns
    # Needs to be built from extraction type and detection input + "_processed"
    #self.extr_family_column = self.get_param("extr_family_column", default="ext")
    # self.extr_prefix_base_column_name = self.extr_family_column + ":" + self.extr_prefix
    # self.extr_check_column = self.extr_prefix_base_column_name + "_processed"
    # # Need to be build from extraction type and extraction input + "_batchid"
    # self.batch_check_column = self.extr_prefix_base_column_name + "_updateid"
    # self.check_columns = [self.extr_check_column, self.batch_check_column]

    self.set_pp()

    # Initialize indexer
    self.indexer = HBaseIndexerMinimal(self.global_conf,
                                       prefix=self.get_required_param("indexer_prefix"))
    self.indexer.pp = "CheckerHBase"
    print(self.get_required_param("indexer_prefix"), self.indexer.get_dictcf_sha1_table())
    self.set_check_columns()
    print(self.check_columns)
    # Initialize ingester
    try:
      self.ingester = GenericKafkaProcessor(self.global_conf,
                                            prefix=self.get_required_param("check_ingester_prefix"))
    except Exception as inst:
      # print "Could not initialize checker, sleeping for {}s.".format(self.max_delay)
      # time.sleep(self.max_delay)
      # raise(inst)
      #print("Could not initialize 'updates_out_topic' ({}). Will write only to HBase.".format(inst))
      print("[{}: ERROR] Could not start ingester.".format(self.pp, inst))
      raise inst
    # This will not be set for HBase processing, but checker would keep dying here...
    self.updates_out_topic = None
    try:
      self.updates_out_topic = self.ingester.get_required_param("producer_updates_out_topic")
    except Exception as inst:
      # print "Could not initialize checker, sleeping for {}s.".format(self.max_delay)
      # time.sleep(self.max_delay)
      # raise(inst)
      #print("Could not initialize 'updates_out_topic' ({}). Will write only to HBase.".format(inst))
      print("{}. Will write only to HBase.".format(inst))

    self.ingester.pp = "ec"
    if self.pid:
      self.ingester.pp += str(self.pid)

  def set_check_columns(self):
    """Set columns to be checked in indexer
    """
    # changed to: get column family from indexer
    extr_prefix_base_column_name = self.indexer.extrcf + ":" + self.extr_prefix
    extr_check_column = extr_prefix_base_column_name + "_processed"
    # Needs to be built from extraction type and extraction input + "_batchid"
    self.batch_check_column = extr_prefix_base_column_name + "_updateid"
    self.check_columns = [extr_check_column, self.batch_check_column]
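    # Illustrative example (actual values depend on the configuration): with an 'ext'
    # extractions column family and extr_prefix 'sentibank_feat_full_image', check_columns
    # would be ['ext:sentibank_feat_full_image_processed', 'ext:sentibank_feat_full_image_updateid']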
    #print(self.check_columns)


  def set_pp(self, pp=""):
    """Set pretty name
    """
    self.pp = "ExtractionChecker"
    self.pp += "-".join(self.list_extr_prefix)
    if self.pid:
      self.pp += "." + str(self.pid)

  def store_img_infos(self, msg):
    """Store information about the images of ``msg`` in ``self.dict_sha1_infos``

    :param msg: decoded message (value of the Kafka record parsed as a dictionary)
    :type msg: dict
    """
    # The raw Kafka message is a ConsumerRecord, i.e. a collections.namedtuple, see:
    # https://github.com/dpkp/kafka-python/blob/master/kafka/consumer/fetcher.py#L30
    # but its JSON value is decoded into a dict before being passed to this method.
    strk = str(msg['sha1'])
    self.dict_sha1_infos[strk] = dict()
    for key in msg:
      # dumps json of 'img_info'
      # We actually need that only for DIG...
      if key == "img_info":
        self.dict_sha1_infos[strk][key] = json.dumps(msg[key])
      else:
        # discard 'img_buffer' (if it exists?...), and 'sha1'
        # if k != "img_buffer" and k != "sha1":
        #  self.dict_sha1_infos[strk][k] = msg[k]
        # discard 'sha1'
        if key != "sha1":
          self.dict_sha1_infos[strk][key] = msg[key]

  def cleanup_dict_infos(self, list_del_sha1s):
    """Remove images ``list_del_sha1s`` from ``self.dict_sha1_infos``

    :param list_del_sha1s: list of images sha1 to remove
    :type list_del_sha1s: list
    """
    for sha1 in list_del_sha1s:
      try:
        del self.dict_sha1_infos[str(sha1)]
      except KeyError:
        # could happen when cleaning up duplicates or image processed by another process
        pass

  def get_dict_push(self, list_get_sha1s, daemon=False):
    """Get dictionary to be pushed to HBase for images in ``list_get_sha1s``

    :param list_get_sha1s: list of images
    :type list_get_sha1s: list
    :param daemon: whether the checker is running in daemon mode
    :type daemon: bool
    :return: (dict_push, update_id)
    :rtype: tuple
    """
    #TODO: is this needed for every get_dict_push call?
    self.set_check_columns()
    # TODO: also pass current update_id, and move the creation of update id out of this method
    #  this method should actually be used to 'claim' an image as soon as we can.
    dict_push = dict()
    # append processid to 'update_id' for safe use with multiple consumers, even after restart
    # /!\ beware, it should not contain underscores
    tmp_update_id, _ = self.indexer.get_next_update_id(today=None, extr_type=self.extr_prefix)
    update_id = tmp_update_id + '-' + self.ingester.pp + '-' + str(time.time())
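    # The resulting update_id looks like '<next-update-id>-<ingester-pp>-<timestamp>',
    # which keeps it unique per checker process so concurrent consumers cannot collide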
    for sha1 in list_get_sha1s:
      dict_push[str(sha1)] = dict()
      tmp_dict = dict()
      try:
        tmp_dict = self.dict_sha1_infos[str(sha1)]
      except KeyError:
        # This would mean the image has been marked as part of another batch by another process,
        # and thus deleted in a previous 'get_unprocessed_rows' call.
        # This is also only relevant if we run in daemon mode...
        # TODO: for transition we won't really have any info to push except the update_id...
        if daemon:
          del dict_push[str(sha1)]
          continue
      # build column names properly i.e. appending 'info:'
      for key in tmp_dict:
        # changed to: use column_family from indexer
        # But the use of 'key' here also means we rely on the input to define column name...
        #dict_push[str(sha1)]['info:' + key] = tmp_dict[key]
        dict_push[str(sha1)][self.indexer.imginfocf + ':' + key] = tmp_dict[key]
      dict_push[str(sha1)][self.batch_check_column] = update_id
    return dict_push, update_id

  def get_unprocessed_rows(self, list_check_sha1s):
    """Get the subset of the list of sha1s ``list_check_sha1s`` that have not been processed yet

    :param list_check_sha1s: list of images sha1 to check
    :type list_check_sha1s: list
    :return: set of unprocessed images
    :rtype: set
    """
    # TODO: also pass current update_id and only delete if != from current update...

    unprocessed_rows = set(list_check_sha1s)

    if list_check_sha1s:
      # Check if the selected sha1 rows in HBase table 'sha1infos' have those check_column
      # This call will only return rows that DO have those check_column
      fam = self.indexer.get_dictcf_sha1_table()
      try:
        sha1s_rows = self.indexer.get_columns_from_sha1_rows(list_check_sha1s, self.check_columns,
                                                             families=fam)
      except Exception as inst:
        print("[{}.get_unprocessed_rows: log] fam: {}".format(self.pp, fam))
        raise inst

      if sha1s_rows:
        # TODO: only delete if really previously processed, i.e. if != from current update...
        found_sha1_rows = set([str(row[0]) for row in sha1s_rows])
        # Clean up 'dict_sha1_infos' deleting found_sha1_rows
        self.cleanup_dict_infos(found_sha1_rows)
        set_list_check_sha1s = set(list_check_sha1s)
        # TODO: but we should not re-add them, so we should discard them from unprocessed_rows
        unprocessed_rows = set_list_check_sha1s - found_sha1_rows

    return unprocessed_rows

  def run(self, daemon=False):
    """Run extraction checker

    :param daemon: whether we are running in daemon mode
    :type daemon: bool
    :raises Exception: if check fails
    """
    i = 0
    try:
      list_sha1s_to_process = []
      # TODO: create update_id here

      while True:
        list_check_sha1s = []

        try:
          # Accumulate images infos
          for msg_json in self.ingester.consumer:
            msg = json.loads(msg_json.value)
            # i += 1
            # print((i, len(list_check_sha1s), msg))

            # msg could now contain keys 'sha1' or 'list_sha1s'
            # should we check that we can't have both or other keys?...
            if 'sha1' in msg:
              list_check_sha1s.append(str(msg['sha1']))
              # Store other fields to be able to push them too
              self.store_img_infos(msg)
            elif 'list_sha1s' in msg:
              for sha1 in msg['list_sha1s']:
                list_check_sha1s.append(str(sha1))
                # We won't have any additional infos no?
                # But we should still build a dict for each sample for consistency...
                tmp_dict = dict()
                tmp_dict['sha1'] = str(sha1)
                # will basically push an empty dict to self.dict_sha1_infos, so self.get_dict_push
                # works properly later on...
                self.store_img_infos(tmp_dict)
            else:
              print('Unknown keys in msg: {}'.format(msg.keys()))

            if len(list_check_sha1s) >= self.indexer.batch_update_size:
              break
        except Exception as inst:
          # trying to use 'consumer_timeout_ms' to raise timeout and get last samples
          msg = "[{}: warning] At {}, caught {} {} in consumer loop"
          now_str = datetime.now().strftime('%Y-%m-%d:%H.%M.%S')
          print(msg.format(self.pp, now_str, type(inst), inst))
          sys.stdout.flush()

        if not list_check_sha1s:
          # TODO: should we fallback to scanning Hbase table here?
          continue

        # Check which images have not been processed (or pushed in an update) yet
        unprocessed_rows = self.get_unprocessed_rows(list_check_sha1s)
        self.nb_imgs_check += len(list_check_sha1s)
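        # Throttle progress reporting: print at most every max_delay/60 seconds,
        # i.e. more often than the actual pushes which wait up to max_delay seconds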
        push_delay = (time.time() - self.last_push) > self.max_delay / 60
        if push_delay and self.nb_imgs_unproc_lastprint != self.nb_imgs_unproc:
          msg = "[{}: log] Found {}/{} unprocessed images"
          print(msg.format(self.pp, self.nb_imgs_unproc, self.nb_imgs_check))
          self.nb_imgs_unproc_lastprint = self.nb_imgs_unproc

        # TODO: we should mark those images as being 'owned' by the update we are constructing
        # (only important if we are running multiple threads i.e. daemon is True)
        # otherwise another update running at the same time could also claim it (in another ad)
        # could be handled when adding data to the searcher but duplicates in extraction process...

        # Push sha1s to be processed
        for sha1 in unprocessed_rows:
          list_sha1s_to_process.append(sha1)

        # Remove potential duplicates
        list_sha1s_to_process = list(set(list_sha1s_to_process))

        if list_sha1s_to_process:
          # Push them to HBase by batch of 'batch_update_size'
          push_delay = (time.time() - self.last_push) > self.max_delay
          full_batch = len(list_sha1s_to_process) >= self.indexer.batch_update_size
          if full_batch or (push_delay and list_sha1s_to_process):
            # Trim here to push exactly a batch of 'batch_update_size'
            list_push = list_sha1s_to_process[:min(self.indexer.batch_update_size,
                                                   len(list_sha1s_to_process))]

            # TODO: this should be done before,
            # to 'claim' the images as soon as we plan to process them for this update
            # Gather corresponding sha1 infos
            dict_push, update_id = self.get_dict_push(list_push, daemon=daemon)
            if dict_push:
              self.nb_imgs_unproc += len(dict_push.keys())
              msg = "[{}: at {}] Pushing update {} of {} images."
              now_str = datetime.now().strftime('%Y-%m-%d:%H.%M.%S')
              print(msg.format(self.pp, now_str, update_id, len(dict_push.keys())))
              sys.stdout.flush()

              # Push images
              fam = self.indexer.get_dictcf_sha1_table()
              if self.verbose > 4:
                msg = "[{}] Pushing images for update {} with fam {}"
                print(msg.format(self.pp, update_id, fam))
              sha1s_table = self.indexer.table_sha1infos_name
              self.indexer.push_dict_rows(dict_push, sha1s_table, families=fam)

              # Build HBase updates dict
              dict_updates_db = dict()
              now_str = datetime.now().strftime('%Y-%m-%d:%H.%M.%S')
              list_sha1s_col = self.indexer.get_col_listsha1s()
              dict_updates_db[update_id] = {list_sha1s_col: ','.join(dict_push.keys()),
                                            self.indexer.get_col_upcreate(): now_str}
              # Push it
              fam = self.indexer.get_dictcf_update_table()
              if self.verbose > 4:
                msg = "[{}] Pushing update {} info with fam {}"
                print(msg.format(self.pp, update_id, fam))
              self.indexer.push_dict_rows(dict_updates_db, self.indexer.table_updateinfos_name,
                                          families=fam)

              # Build HBase updates dict
              if self.updates_out_topic is not None:
                dict_updates_kafka = dict()
                dict_updates_kafka[update_id] = ','.join(dict_push.keys())
                # Push it
                self.ingester.producer.send(self.updates_out_topic, json.dumps(dict_updates_kafka))

              # Gather any remaining sha1s and clean up infos
              if len(list_sha1s_to_process) > self.indexer.batch_update_size:
                list_sha1s_to_process = list_sha1s_to_process[self.indexer.batch_update_size:]
              else:
                list_sha1s_to_process = []
              # if duplicates wrt list_push, remove them. Can this still happen?
              list_sha1s_to_process = [sh1 for sh1 in list_sha1s_to_process if sh1 not in list_push]
              self.cleanup_dict_infos(list_push)
            else:
              msg = "[{}: at {}] Nothing to push for update {}"
              print(msg.format(self.pp, datetime.now().strftime('%Y-%m-%d:%H.%M.%S'), update_id))
              sys.stdout.flush()
            self.last_push = time.time()
            # TODO: we should create a new update_id here,
            # and let it claim the potential remaining images in 'list_sha1s_to_process'
            # sanity check that len(list_sha1s_to_process) == len(self.dict_sha1_infos) ?

    except Exception as inst:
      exc_type, exc_obj, exc_tb = sys.exc_info()
      fulltb = traceback.format_tb(exc_tb)
      raise type(inst)(" {} ({})".format(inst, ''.join(fulltb)))
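
# Minimal usage sketch (hypothetical configuration file 'conf_extr_check.json'; it must
# provide, with the appropriate prefix, the parameters read in __init__ above, i.e.
# 'featurizer_type', 'detector_type', 'input_type', 'indexer_prefix' and 'check_ingester_prefix'):
#   checker = ExtractionChecker("conf_extr_check.json")
#   checker.run(daemon=False)
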
class ExtractionProcessor(ConfReader):
    """ExtractionProcessor class
  """
    def __init__(self, global_conf, prefix=DEFAULT_EXTR_PROC_PREFIX):
        """ExtractionProcessor constructor

    :param global_conf: configuration file or dictionary
    :type global_conf: str, dict
    :param prefix: prefix in configuration
    :type prefix: str
    """
        self.extractor = None
        self.nb_empt = 0
        self.nb_err = 0
        self.max_proc_time = 1200  # in seconds. Increased for sbcmdline...
        self.url_input = True

        super(ExtractionProcessor, self).__init__(global_conf, prefix)

        # TODO: move that to self.read_conf()
        # Get required parameters
        self.input_type = self.get_required_param("input_type")
        self.nb_threads = self.get_required_param("nb_threads")
        self.featurizer_type = self.get_required_param("featurizer_type")
        self.featurizer_prefix = self.get_required_param("featurizer_prefix")
        self.detector_type = self.get_required_param("detector_type")

        # Get optional parameters
        self.verbose = int(self.get_param("verbose", default=0))
        self.ingestion_input = self.get_param("ingestion_input",
                                              default="kafka")
        self.push_back = self.get_param("push_back", default=False)
        file_input = self.get_param("file_input")
        print("[{}.ExtractionProcessor: log] file_input: {}".format(
            self.pp, file_input))
        if file_input:
            self.url_input = False
        print("[{}.ExtractionProcessor: log] url_input: {}".format(
            self.pp, self.url_input))

        # Means we extract feature from the whole image
        if self.detector_type == "full":
            self.detector = None

        self.extr_prefix = build_extr_str(self.featurizer_type,
                                          self.detector_type, self.input_type)
        self.set_pp()

        # Initialize queues
        self.init_queues()

        # Initialize indexer
        # We now have two indexers:
        # - one "in_indexer" for TF table with buffer, img URLs etc...
        # - one "out_indexer" for our table with extractions etc
        # NB: they could be the same if tables are merged...
        self.out_indexer = HBaseIndexerMinimal(
            self.global_conf, prefix=self.get_required_param("indexer_prefix"))
        prefix_in_indexer = self.get_param("in_indexer_prefix", default=False)
        if prefix_in_indexer:
            self.in_indexer = HBaseIndexerMinimal(self.global_conf,
                                                  prefix=prefix_in_indexer)
            insha1tab = self.in_indexer.table_sha1infos_name
            insha1cfs = self.in_indexer.get_dictcf_sha1_table()
            print("[{}] 'in_indexer' sha1 table {} columns are: {}".format(
                self.pp, insha1tab, insha1cfs))
        else:
            print(
                "[{}] empty 'in_indexer_prefix', using out_indexer as in_indexer too."
                .format(self.pp))
            self.in_indexer = self.out_indexer

        # Initialize extractors only once (just one first)
        self.extractors = []
        # DONE: use 'out_indexer'
        self.extractors.append(
            GenericExtractor(self.detector_type, self.featurizer_type,
                             self.input_type, self.out_indexer.extrcf,
                             self.featurizer_prefix, self.global_conf))

        # DONE: use 'in_indexer'
        if self.url_input:
            self.img_column = self.in_indexer.get_col_imgurl()
        else:
            self.img_column = self.in_indexer.get_col_imgpath()
        img_cols = [
            self.in_indexer.get_col_imgbuff(),
            self.in_indexer.get_col_imgurlbak(), self.img_column
        ]
        print("[{}.ExtractionProcessor: log] img_cols: {}".format(
            self.pp, img_cols))

        self.last_update_date_id = ''

        # Initialize ingester
        self.ingester = GenericKafkaProcessor(
            self.global_conf,
            prefix=self.get_required_param("proc_ingester_prefix"))
        self.ingester.pp = "ep"

    def set_pp(self, pp="ExtractionProcessor"):
        """Set pretty name

    :param pp: pretty name prefix
    :type pp: str
    """
        self.pp = pp
        if self.extractor:
            self.pp += "_" + self.extr_prefix

    def init_queues(self):
        """Initialize queues list ``self.q_in`` and ``self.q_out``
    """
        from multiprocessing import JoinableQueue
        self.q_in = []
        self.q_out = []
        for _ in range(self.nb_threads):
            self.q_in.append(JoinableQueue(0))
            self.q_out.append(JoinableQueue(0))

    # Should these two methods be in indexer?
    def is_update_unprocessed(self, update_id):
        """Check if an update was not processed yet

    :param update_id: update id
    :type update_id: str
    :return: boolean indicating whether update ``update_id`` is unprocessed
    :rtype: bool
    """
        # DONE: use out_indexer
        update_rows = self.out_indexer.get_rows_by_batch(
            [update_id], table_name=self.out_indexer.table_updateinfos_name)
        if update_rows:
            for row in update_rows:
                if self.out_indexer.get_col_upproc() in row[1]:
                    return False
        return True

    def is_update_notstarted(self, update_id, max_delay=None):
        """Check if an update was not started yet

    :param update_id: update id
    :type update_id: str
    :param max_delay: delay (in seconds) between marked start time and now to consider update failed
    :type max_delay: int
    :return: boolean
    :rtype: bool
    """
        # DONE: use out_indexer
        update_rows = self.out_indexer.get_rows_by_batch(
            [update_id], table_name=self.out_indexer.table_updateinfos_name)
        if update_rows:
            for row in update_rows:
                # changed to: self.column_update_started
                #if info_column_family+":"+update_str_started in row[1]:
                #if self.column_update_started in row[1]:
                # DONE: use out_indexer
                if self.out_indexer.get_col_upstart() in row[1]:
                    if max_delay:
                        start_str = row[1][self.out_indexer.get_col_upstart()]
                        # start time format is '%Y-%m-%d:%H.%M.%S'
                        start_dt = datetime.strptime(start_str,
                                                     '%Y-%m-%d:%H.%M.%S')
                        now_dt = datetime.now()
                        diff_dt = now_dt - start_dt
                        if diff_dt.total_seconds() > max_delay:
                            return True
                    return False
        return True

    def get_batch_hbase(self):
        """Get one batch of images from HBase

    :yield: tuple (rows_batch, update_id)
    """
        # legacy implementation: better to have a kafka topic for batches to be processed to allow
        # safe and efficient parallelization on different machines
        # DONE: use in_indexer
        img_cols = [
            self.in_indexer.get_col_imgbuff(),
            self.in_indexer.get_col_imgurlbak(), self.img_column
        ]
        try:
            # DONE: use out_indexer
            for updates in self.out_indexer.get_unprocessed_updates_from_date(
                    self.last_update_date_id, extr_type=self.extr_prefix):
                for update_id, update_cols in updates:
                    if self.extr_prefix in update_id:
                        # double check update has not been processed somewhere else
                        if self.is_update_unprocessed(update_id):
                            # double check update was not marked as started recently i.e. by another process
                            if self.is_update_notstarted(
                                    update_id, max_delay=TIME_ELAPSED_FAILED):
                                # DONE: use out_indexer
                                list_sha1s = update_cols[
                                    self.out_indexer.get_col_listsha1s(
                                    )].split(',')
                                msg = "[{}.get_batch_hbase: log] Update {} has {} images."
                                print(
                                    msg.format(self.pp, update_id,
                                               len(list_sha1s)))
                                # also get 'ext:' to check if extraction was already processed?
                                # DONE: use in_indexer
                                rows_batch = self.in_indexer.get_columns_from_sha1_rows(
                                    list_sha1s, columns=img_cols)
                                # print "rows_batch", rows_batch
                                if rows_batch:
                                    yield rows_batch, update_id
                                    self.last_update_date_id = '_'.join(
                                        update_id.split('_')[-2:])
                                else:
                                    msg = "[{}.get_batch_hbase: log] Did not get any image buffer for update: {}"
                                    print(msg.format(self.pp, update_id))
                            else:
                                msg = "[{}.get_batch_hbase: log] Skipping update started recently: {}"
                                print(msg.format(self.pp, update_id))
                                continue
                        else:
                            msg = "[{}.get_batch_hbase: log] Skipping already processed update: {}"
                            print(msg.format(self.pp, update_id))
                            continue
                    else:
                        if self.verbose > 6:
                            msg = "[{}.get_batch_hbase: log] Skipping update {} from another extraction type."
                            print(msg.format(self.pp, update_id))
            else:
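                # for-else: this branch runs once the generator above is exhausted
                # (there is no 'break' in the loop), including when no update was yielded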
                print("[{}.get_batch_hbase: log] No unprocessed update found.".
                      format(self.pp))
                # Look for updates that have some unprocessed images
                # TODO: whether we do that or not could be specified by a parameter
                # as this induces slow down during update...
                # DONE: use out_indexer
                for updates in self.out_indexer.get_missing_extr_updates_from_date(
                        "1970-01-01", extr_type=self.extr_prefix):
                    for update_id, update_cols in updates:
                        if self.extr_prefix in update_id:
                            # DONE: use out_indexer
                            if self.out_indexer.get_col_listsha1s(
                            ) in update_cols:
                                list_sha1s = update_cols[
                                    self.out_indexer.get_col_listsha1s(
                                    )].split(',')
                                msg = "[{}.get_batch_hbase: log] Update {} has {} images missing extractions."
                                print(
                                    msg.format(self.pp, update_id,
                                               len(list_sha1s)))
                                sys.stdout.flush()
                                # also get 'ext:' to check if extraction was already processed?
                                # DONE: use in_indexer
                                rows_batch = self.in_indexer.get_columns_from_sha1_rows(
                                    list_sha1s, columns=img_cols)
                                if rows_batch:
                                    yield rows_batch, update_id
                                else:
                                    msg = "[{}.get_batch_hbase: log] Did not get any image buffer for update: {}"
                                    print(msg.format(self.pp, update_id))
                            else:
                                msg = "[{}.get_batch_hbase: log] Update {} has no images list."
                                print(msg.format(self.pp, update_id))
                        else:
                            msg = "[{}.get_batch_hbase: log] Skipping update {} from another extraction type."
                            print(msg.format(self.pp, update_id))
                else:
                    msg = "[{}.get_batch_hbase: log] No updates with missing extractions found."
                    print(msg.format(self.pp))
                    sys.stdout.flush()

        except Exception as inst:
            full_trace_error("[{}.get_batch_hbase: error] {}".format(
                self.pp, inst))

    def get_batch_kafka(self):
        """Get one batch of images from Kafka

    :yield: tuple (rows_batch, update_id)
    """
        # Read from a kafka topic to allow safer parallelization on different machines
        # DONE: use in_indexer
        img_cols = [
            self.in_indexer.get_col_imgbuff(),
            self.in_indexer.get_col_imgurlbak(), self.img_column
        ]
        try:
            # Needs to read topic to get update_id and list of sha1s
            if self.ingester.consumer:
                for msg in self.ingester.consumer:
                    msg_dict = json.loads(msg.value)
                    update_id = msg_dict.keys()[0]
                    # NB: Try to get update info and check it was really not processed yet.
                    if self.is_update_unprocessed(update_id):
                        str_list_sha1s = msg_dict[update_id]
                        list_sha1s = str_list_sha1s.split(',')
                        msg = "[{}.get_batch_kafka: log] Update {} has {} images."
                        print(msg.format(self.pp, update_id, len(list_sha1s)))
                        if self.verbose > 3:
                            msg = "[{}.get_batch_kafka: log] Looking for columns: {}"
                            print(msg.format(self.pp, img_cols))
                        # DONE: use in_indexer
                        rows_batch = self.in_indexer.get_columns_from_sha1_rows(
                            list_sha1s, columns=img_cols)
                        #print "rows_batch", rows_batch
                        if rows_batch:
                            if self.verbose > 4:
                                msg = "[{}.get_batch_kafka: log] Yielding for update: {}"
                                print(msg.format(self.pp, update_id))
                            yield rows_batch, update_id
                            self.ingester.consumer.commit()
                            if self.verbose > 4:
                                msg = "[{}.get_batch_kafka: log] After yielding for update: {}"
                                print(msg.format(self.pp, update_id))
                            self.last_update_date_id = '_'.join(
                                update_id.split('_')[-2:])
                        # Should we try to commit offset only at this point?
                        else:
                            msg = "[{}.get_batch_kafka: log] Did not get any image buffers for the update: {}"
                            print(msg.format(self.pp, update_id))
                    else:
                        msg = "[{}.get_batch_kafka: log] Skipping already processed update: {}"
                        print(msg.format(self.pp, update_id))
                else:
                    print("[{}.get_batch_kafka: log] No update found.".format(
                        self.pp))
                    # Fall back to checking HBase for unstarted/unfinished updates
                    for rows_batch, update_id in self.get_batch_hbase():
                        yield rows_batch, update_id
            else:
                print("[{}.get_batch_kafka: log] No consumer found.".format(
                    self.pp))
                # Fall back to checking HBase for unstarted/unfinished updates
                for rows_batch, update_id in self.get_batch_hbase():
                    yield rows_batch, update_id
        except Exception as inst:
            full_trace_error("[{}.get_batch_kafka: error] {}".format(
                self.pp, inst))

    def get_batch(self):
        """Get one batch of images

    :yield: tuple (rows_batch, update_id)
    """
        if self.ingestion_input == "hbase":
            for rows_batch, update_id in self.get_batch_hbase():
                yield rows_batch, update_id
        else:
            for rows_batch, update_id in self.get_batch_kafka():
                yield rows_batch, update_id

    def process_batch(self):
        """Process one batch of images

    :raises Exception: if something goes really wrong
    """
        # Get a new update batch
        try:
            for rows_batch, update_id in self.get_batch():
                start_update = time.time()
                print("[{}] Processing update {} of {} rows.".format(
                    self.pp, update_id, len(rows_batch)))
                sys.stdout.flush()

                # Initialize
                self.nb_empt = 0
                self.init_queues()
                threads = []

                # If we have deleted an extractor at some point or for first batch
                nb_extr_to_create = self.nb_threads - len(self.extractors)
                if nb_extr_to_create:
                    start_create_extractor = time.time()
                    while len(self.extractors) < min(self.nb_threads,
                                                     len(rows_batch)):
                        # DONE: use 'out_indexer'
                        self.extractors.append(
                            GenericExtractor(self.detector_type,
                                             self.featurizer_type,
                                             self.input_type,
                                             self.out_indexer.extrcf,
                                             self.featurizer_prefix,
                                             self.global_conf))
                    msg = "[{}] Created {} extractors in {}s."
                    create_extr_time = time.time() - start_create_extractor
                    print(
                        msg.format(self.pp, len(self.extractors),
                                   create_extr_time))

                # Mark batch as started to be processed
                now_str = datetime.now().strftime('%Y-%m-%d:%H.%M.%S')
                # changed to: self.column_update_started
                #dict_val = {info_column_family + ':' + update_str_started: now_str}
                #dict_val = {self.column_update_started: now_str}
                # DONE: use out_indexer
                dict_val = {self.out_indexer.get_col_upstart(): now_str}
                update_started_dict = {update_id: dict_val}
                # DONE: use out_indexer
                self.out_indexer.push_dict_rows(
                    dict_rows=update_started_dict,
                    table_name=self.out_indexer.table_updateinfos_name)

                # TODO: define a get_buffer_images method
                # --------
                # Push images to queue
                list_in = []
                # For parallelized downloading...
                from Queue import Queue
                nb_imgs_dl = 0
                q_in_dl = Queue(0)
                q_out_dl = Queue(0)

                start_get_buffer = time.time()
                # DONE: use in_indexer in all this scope
                # How could we transfer URL from in table to out table if they are different?...
                for img in rows_batch:
                    # should decode base64
                    #if img_buffer_column in img[1]:
                    if self.in_indexer.get_col_imgbuff() in img[1]:
                        # That's messy...
                        b64buffer = buffer_to_B64(
                            cStringIO.StringIO(
                                img[1][self.in_indexer.get_col_imgbuff()]))
                        tup = (img[0], b64buffer, False)
                        list_in.append(tup)
                    else:
                        # need to re-download, accumulate a list of URLs to download
                        # Deal with img_path_column for local_images_kafka_pusher
                        if self.img_column in img[1]:
                            q_in_dl.put((img[0], img[1][self.img_column],
                                         self.push_back))
                            nb_imgs_dl += 1
                        elif self.in_indexer.get_col_imgurlbak() in img[1]:
                            q_in_dl.put(
                                (img[0],
                                 img[1][self.in_indexer.get_col_imgurlbak()],
                                 self.push_back))
                            nb_imgs_dl += 1
                        else:
                            msg = "[{}: warning] No buffer and no URL/path for image {} !"
                            print(msg.format(self.pp, img[0]))
                            continue

                # Download missing images
                nb_dl = 0
                nb_dl_failed = 0
                if nb_imgs_dl > 0:
                    threads_dl = []
                    for i in range(min(self.nb_threads, nb_imgs_dl)):
                        # should read (url, obj_pos) from self.q_in
                        # and push (url, obj_pos, buffer, img_info, start_process, end_process) to self.q_out
                        thread = ThreadedDownloaderBufferOnly(
                            q_in_dl, q_out_dl, url_input=self.url_input)
                        thread.start()
                        threads_dl.append(thread)

                    q_in_dl.join()

                    # Push downloaded images to list_in too
                    while nb_dl < nb_imgs_dl:
                        # This can block?
                        #sha1, buffer, push_back, inst = q_out_dl.get()
                        try:
                            sha1, buffer, push_back, inst = q_out_dl.get(
                                True, 10)
                        except Exception as queue_err:
                            msg = "[{}: error] Download queue out timed out: {}"
                            print(msg.format(self.pp, queue_err))
                            break
                        nb_dl += 1
                        if inst:
                            if self.verbose > 6:
                                msg = "[{}: log] Could not download image {}, error was: {}"
                                print(msg.format(self.pp, sha1, inst))
                            nb_dl_failed += 1
                        else:
                            if buffer:
                                list_in.append((sha1, buffer, push_back))
                            else:
                                # Is that even possible?
                                msg = "[{}: error] No error but no buffer either for image {}"
                                print(msg.format(self.pp, sha1))

                get_buffer_time = time.time() - start_get_buffer
                msg = "[{}] Got {}/{} image buffers ({}/{} downloaded) for update {} in {}s."
                print(
                    msg.format(self.pp, len(list_in), len(rows_batch),
                               nb_dl - nb_dl_failed, nb_dl, update_id,
                               get_buffer_time))
                sys.stdout.flush()
                # --------

                # TODO: define a get_features method
                # --------
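                # Split image buffers evenly across the input queues,
                # e.g. 100 buffers over 8 threads gives batches of ceil(100/8) = 13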
                q_batch_size = int(
                    math.ceil(float(len(list_in)) / self.nb_threads))
                for i, q_batch in enumerate(build_batch(list_in,
                                                        q_batch_size)):
                    self.q_in[i].put(q_batch)

                q_in_size = []
                q_in_size_tot = 0
                for i in range(self.nb_threads):
                    q_in_size.append(self.q_in[i].qsize())
                    q_in_size_tot += q_in_size[i]
                if self.verbose > 5:
                    print("[{}] Total input queues sizes is: {}".format(
                        self.pp, q_in_size_tot))

                # Start daemons...
                thread_creation_failed = [0] * self.nb_threads
                for i in range(self.nb_threads):
                    # one per non empty input queue
                    if q_in_size[i] > 0:
                        try:
                            thread = DaemonBatchExtractor(self.extractors[i],
                                                          self.q_in[i],
                                                          self.q_out[i],
                                                          verbose=self.verbose)
                            # Could get a 'Cannot allocate memory' if we are using too many threads...
                            thread.start()
                            threads.append(thread)
                        except OSError as inst:
                            # Should we try to push self.q_in[i] data to some other thread?
                            msg = "[{}.process_batch: error] Could not start thread #{}: {}"
                            print(msg.format(self.pp, i + 1, inst))
                            thread_creation_failed[i] = 1
                            time.sleep(10 * sum(thread_creation_failed))

                if sum(thread_creation_failed) == self.nb_threads:
                    # We are in trouble...
                    raise ValueError("Could not start any thread...")

                nb_threads_running = len(threads)
                start_process = time.time()
                stop = time.time() + self.max_proc_time
                # Wait for all tasks to be marked as done
                threads_finished = [0] * nb_threads_running
                deleted_extr = [0] * nb_threads_running
                thread_msg = "[{}] Thread {}/{} (pid: {}) "
                while sum(threads_finished) < nb_threads_running:
                    for i in range(nb_threads_running):
                        if sum(threads_finished) == nb_threads_running:
                            sys.stdout.flush()
                            break
                        if threads_finished[i] == 1:
                            continue
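                        # Map the running-thread index back to its original queue index,
                        # skipping queues whose thread failed to start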
                        i_q_in = i + sum(thread_creation_failed[:i + 1])
                        if q_in_size[i_q_in] > 0:
                            # This seems to block forever sometimes, if subprocess crashed?...
                            #self.q_in[i].join()
                            # Manual join with timeout...
                            # https://github.com/python/cpython/blob/3.6/Lib/multiprocessing/queues.py
                            if not self.q_in[
                                    i_q_in]._unfinished_tasks._semlock._is_zero(
                                    ) and time.time() < stop:
                                time.sleep(1)
                            else:
                                if self.q_in[
                                        i_q_in]._unfinished_tasks._semlock._is_zero(
                                        ):
                                    if self.verbose > 5:
                                        msg = thread_msg + "marked as finished because processing seems finished"
                                        print(
                                            msg.format(self.pp, i + 1,
                                                       nb_threads_running,
                                                       threads[i].pid))
                                else:
                                    if self.verbose > 0:
                                        # In which cases does this happen?...
                                        msg = thread_msg + "force marked task as done as max_proc_time ({}) has passed."
                                        print(
                                            msg.format(self.pp, i + 1,
                                                       nb_threads_running,
                                                       threads[i].pid,
                                                       self.max_proc_time))
                                        sys.stdout.flush()
                                        # Try to delete corresponding extractor to free memory?
                                        # And reduce number of threads at the end of the loop
                                    try:
                                        self.q_in[i_q_in].task_done()
                                        if deleted_extr[i] == 0:
                                            # we pushed the extractor as self.extractors[i] in a loop of self.nb_threads
                                            # we use i_q_in
                                            del self.extractors[i_q_in]
                                            deleted_extr[i] = 1
                                    except Exception:
                                        pass
                                threads_finished[i] = 1
                        else:
                            if self.verbose > 2:
                                # We actually never gave something to process...
                                msg = thread_msg + "marked as finished because no data was passed to it"
                                print(
                                    msg.format(self.pp, i + 1,
                                               nb_threads_running,
                                               threads[i].pid))
                            threads_finished[i] = 1

                # Cleanup threads to free memory before getting data back
                # Daemon may still be running...
                # and will actually be deleted only when they exit after not getting a batch
                del threads

                # Gather results
                q_out_size = []
                q_out_size_tot = 0
                for i in range(self.nb_threads):
                    q_out_size.append(self.q_out[i].qsize())
                    q_out_size_tot += q_out_size[i]

                if self.verbose > 5:
                    print("[{}: log] Total output queues size is: {}".format(
                        self.pp, q_out_size_tot))
                    sys.stdout.flush()

                # Can get stuck here?
                dict_imgs = dict()
                for i in range(self.nb_threads):
                    if self.verbose > 4:
                        print("[{}] Thread {} q_out_size: {}".format(
                            self.pp, i + 1, q_out_size[i]))
                        sys.stdout.flush()
                    while q_out_size[i] > 0 and not self.q_out[i].empty():
                        if self.verbose > 6:
                            print("[{}] Thread {} q_out is not empty.".format(
                                self.pp, i + 1))
                            sys.stdout.flush()
                        try:
                            batch_out = self.q_out[i].get(True, 10)
                            if self.verbose > 4:
                                msg = "[{}] Got batch of {} features from thread {} q_out."
                                print(
                                    msg.format(self.pp, len(batch_out), i + 1))
                                sys.stdout.flush()
                            for sha1, dict_out in batch_out:
                                dict_imgs[sha1] = dict_out
                        except Exception as inst:
                            if self.verbose > 1:
                                print(
                                    "[{}] Thread {} failed to get from q_out: {}"
                                    .format(self.pp, i + 1, inst))
                                sys.stdout.flush()
                        if self.verbose > 4:
                            print(
                                "[{}] Marking task done in q_out of thread {}."
                                .format(self.pp, i + 1))
                            sys.stdout.flush()
                        self.q_out[i].task_done()

                #if self.verbose > 0:
                print_msg = "[{}] Got features for {}/{} images in {}s."
                proc_time = time.time() - start_process
                print(
                    print_msg.format(self.pp, len(dict_imgs.keys()),
                                     len(list_in), proc_time))
                sys.stdout.flush()
                # --------

                # Push them
                # DONE: use out_indexer
                self.out_indexer.push_dict_rows(
                    dict_rows=dict_imgs,
                    table_name=self.out_indexer.table_sha1infos_name)

                # Mark batch as processed
                now_str = datetime.now().strftime('%Y-%m-%d:%H.%M.%S')
                # DONE: use out_indexer
                update_processed_dict = {
                    update_id: {
                        self.out_indexer.get_col_upproc(): now_str
                    }
                }
                self.out_indexer.push_dict_rows(
                    dict_rows=update_processed_dict,
                    table_name=self.out_indexer.table_updateinfos_name)

                # Mark as completed if all rows had an extraction
                if len(rows_batch) == len(dict_imgs.keys()):
                    # DONE: use out_indexer
                    update_completed_dict = {
                        update_id: {
                            self.out_indexer.get_col_upcomp(): str(1)
                        }
                    }
                    self.out_indexer.push_dict_rows(
                        dict_rows=update_completed_dict,
                        table_name=self.out_indexer.table_updateinfos_name)

                # Cleanup
                del self.q_in
                del self.q_out

                # To try to adjust a too optimistic nb_threads setting
                # if (sum(thread_creation_failed) > 0 or sum(deleted_extr) > 0) and self.nb_threads > 2:
                #   self.nb_threads -= 1

                msg = "[{}] Completed update {} in {}s."
                print(
                    msg.format(self.pp, update_id,
                               time.time() - start_update))
                sys.stdout.flush()
                self.nb_err = 0

                # Force garbage collection?
                gc.collect()

                # Should we just raise an Exception and restart clean?
                if sum(thread_creation_failed) > 0 or sum(deleted_extr) > 0:
                    raise ValueError(
                        "Something went wrong. Trying to restart clean")

        except Exception as inst:
            exc_type, exc_obj, exc_tb = sys.exc_info()
            fulltb = traceback.format_tb(exc_tb)
            raise type(inst)(" {} ({})".format(inst, ''.join(fulltb)))

    def run(self):
        """Run processor
    """
        self.nb_empt = 0
        self.nb_err = 0
        while True:
            self.process_batch()
            msg = "[ExtractionProcessor: log] Nothing to process at: {}"
            print(msg.format(datetime.now().strftime('%Y-%m-%d:%H.%M.%S')))
            sys.stdout.flush()
            time.sleep(10 * self.nb_empt)
            self.nb_empt += 1
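
# Minimal usage sketch (hypothetical configuration file 'conf_extr_proc.json'; it must
# provide, with the appropriate prefix, the parameters read in __init__ above, e.g.
# 'input_type', 'nb_threads', 'featurizer_type', 'featurizer_prefix', 'detector_type',
# 'indexer_prefix' and 'proc_ingester_prefix'):
#   processor = ExtractionProcessor("conf_extr_proc.json")
#   processor.run()
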
Example #3
class ExtractionChecker(ConfReader):
    """ExtractionChecker class
  """
    def __init__(self,
                 global_conf,
                 prefix=DEFAULT_EXTR_CHECK_PREFIX,
                 pid=None):
        """ExtractionChecker constructor

    :param global_conf: configuration file or dictionary
    :type global_conf: str, dict
    :param prefix: prefix in configuration
    :type prefix: str
    :param pid: process id
    :type pid: int
    """
        self.list_extr_prefix = []
        self.pid = pid
        self.dict_sha1_infos = dict()

        super(ExtractionChecker, self).__init__(global_conf, prefix)

        self.last_push = time.time()
        self.nb_imgs_check = 0
        self.nb_imgs_unproc = 0
        self.nb_imgs_unproc_lastprint = 0

        self.featurizer_type = self.get_required_param("featurizer_type")
        self.detector_type = self.get_required_param("detector_type")
        self.input_type = self.get_required_param("input_type")

        # Max delay
        self.max_delay = int(
            self.get_param("max_delay", default=DEFAULT_MAX_DELAY))
        self.min_len_check = int(
            self.get_param("min_len_check", default=DEFAULT_MIN_LENGTH_CHECK))

        self.list_extr_prefix = [
            self.featurizer_type, "feat", self.detector_type, self.input_type
        ]
        self.extr_prefix = "_".join(self.list_extr_prefix)
        self.batch_check_column = None
        self.check_columns = []

        # changed to: get column family from indexer in set_check_columns
        # Needs to be built from extraction type and detection input + "_processed"
        #self.extr_family_column = self.get_param("extr_family_column", default="ext")
        # self.extr_prefix_base_column_name = self.extr_family_column + ":" + self.extr_prefix
        # self.extr_check_column = self.extr_prefix_base_column_name + "_processed"
        # # Need to be build from extraction type and extraction input + "_batchid"
        # self.batch_check_column = self.extr_prefix_base_column_name + "_updateid"
        # self.check_columns = [self.extr_check_column, self.batch_check_column]

        self.set_pp()

        # Initialize indexer
        self.indexer = HBaseIndexerMinimal(
            self.global_conf, prefix=self.get_required_param("indexer_prefix"))
        self.indexer.pp = "CheckerHBase"
        print(self.get_required_param("indexer_prefix"),
              self.indexer.get_dictcf_sha1_table())
        self.set_check_columns()
        print(self.check_columns)

        # Initialize ingester, which could now be Kafka or Kinesis. Should we have a default?
        ingester_type = self.get_required_param("image_ingestion_type")
        try:
            if ingester_type == "kafka":
                self.ingester = KafkaIngester(
                    self.global_conf,
                    prefix=self.get_required_param("check_ingester_prefix"))
            elif ingester_type == "kinesis":
                self.ingester = KinesisIngester(
                    self.global_conf,
                    prefix=self.get_required_param("check_ingester_prefix"))
            else:
                raise ValueError(
                    "Unknown 'ingester_type': {}".format(ingester_type))
        except Exception as inst:
            # print "Could not initialize checker, sleeping for {}s.".format(self.max_delay)
            # time.sleep(self.max_delay)
            # raise(inst)
            #print("Could not initialize 'updates_out_topic' ({}). Will write only to HBase.".format(inst))
            print("[{}: ERROR] Could not start ingester.".format(
                self.pp, inst))
            raise inst

        # Initialize producer
        # TODO: also check for 'update_ingestion_type' as producer_type?
        producer_type = self.get_param("update_ingestion_type",
                                       DEFAULT_UPDATE_INGESTION_TYPE)
        # TODO: create a producer if 'update_ingestion_type' is Kinesis or Kafka
        # if producer_type != "hbase":
        #   self.updates_out_topic = self.ingester.get_required_param("producer_updates_out_topic")
        if producer_type == "kafka":
            self.pusher = KafkaPusher(
                self.global_conf,
                prefix=self.get_required_param("check_ingester_prefix"))
        elif producer_type == "kinesis":
            self.pusher = KinesisPusher(
                self.global_conf,
                prefix=self.get_required_param("check_ingester_prefix"))
        elif producer_type == "hbase":
            self.pusher = None
            print("[{}: log] Will write updates only to HBase.".format(
                self.pp))
        else:
            raise ValueError(
                "Unknown 'producer_type': {}".format(producer_type))
        #self.ingester.pp = self.get_param("pp", "ImageIngester")

        # Only if daemon mode, as we may have multiple ingesters
        # But for Kinesis the `shard_infos_filename` may not be re-used...
        #if self.pid:
        #  self.ingester.pp += str(self.pid)

    def set_check_columns(self):
        """Set columns to be checked in indexer
    """
        # changed to: get column family from indexer
        # TODO: get the suffixes as global variables maybe from common.defaults
        extr_prefix_base_column_name = self.indexer.extrcf + ":" + self.extr_prefix
        extr_check_column = extr_prefix_base_column_name + "_processed"
        # Needs to be built from extraction type and extraction input + "_batchid"
        self.batch_check_column = extr_prefix_base_column_name + "_updateid"
        self.check_columns = [extr_check_column, self.batch_check_column]
        #print(self.check_columns)
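        # Illustrative example (hypothetical values): with an extraction column family "ext",
        # featurizer_type "sbpycaffe", detector_type "full" and input_type "image",
        # 'check_columns' would be:
        #   ["ext:sbpycaffe_feat_full_image_processed",
        #    "ext:sbpycaffe_feat_full_image_updateid"]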

    def set_pp(self, pp=""):
        """Set pretty name
    """
        self.pp = "ExtractionChecker"
        self.pp += "-".join(self.list_extr_prefix)
        if self.pid:
            self.pp += "." + str(self.pid)

    def store_img_infos(self, msg):
        """Store information about the images of ``msg`` in ``self.dict_sha1_infos``

        :param msg: message
        :type msg: dict
        """
        strk = str(msg['sha1']).upper()
        self.dict_sha1_infos[strk] = dict()
        for key in msg:
            # dumps json of 'img_info'
            # We actually need that only for DIG...
            if key == "img_info":
                self.dict_sha1_infos[strk][key] = json.dumps(msg[key])
            else:
                # discard 'img_buffer' (if it exists?...), and 'sha1'
                # if k != "img_buffer" and k != "sha1":
                #  self.dict_sha1_infos[strk][k] = msg[k]
                # discard 'sha1'
                if key != "sha1":
                    self.dict_sha1_infos[strk][key] = msg[key]
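        # Illustrative example (hypothetical message, "s3_url" is just an example field):
        #   store_img_infos({"sha1": "abcd...", "s3_url": "http://...",
        #                    "img_info": {"format": "JPEG"}})
        # would yield
        #   self.dict_sha1_infos["ABCD..."] == {"s3_url": "http://...",
        #                                       "img_info": '{"format": "JPEG"}'}
        # i.e. 'sha1' is dropped (it becomes the key) and 'img_info' is JSON-dumped.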

    def cleanup_dict_infos(self, list_del_sha1s):
        """Remove images ``list_del_sha1s`` from ``self.dict_sha1_infos``

        :param list_del_sha1s: list of images sha1 to remove
        :type list_del_sha1s: list
        """
        for sha1 in list_del_sha1s:
            try:
                del self.dict_sha1_infos[str(sha1)]
            except KeyError:
                # could happen when cleaning up duplicates or image processed by another process
                pass

    def get_dict_push(self, list_get_sha1s, daemon=False):
        """Get dictionary to be pushed to HBase for images in ``list_get_sha1s``

        :param list_get_sha1s: list of images
        :type list_get_sha1s: list
        :param daemon: whether the checker is running in daemon mode
        :type daemon: bool
        :return: (dict_push, update_id)
        :rtype: tuple
        """
        #TODO: is this needed for every get_dict_push call?
        self.set_check_columns()
        # TODO: also pass current update_id, and move the creation of update id out of this method
        #  this method should actually be used to 'claim' an image as soon as we can.
        dict_push = dict()
        # append processid to 'update_id' for safe use with multiple consumers, even after restart
        # /!\ beware, it should not contain underscores
        tmp_update_id, _ = self.indexer.get_next_update_id(
            today=None, extr_type=self.extr_prefix)
        update_id = tmp_update_id + '-' + self.ingester.pp + '-' + str(
            time.time())
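        # For illustration only (format assumption): 'update_id' concatenates the indexer's
        # next update id, the ingester pretty name and a timestamp, e.g. something like
        # "<tmp_update_id>-ec1-1514764800.0" if the ingester's pp were "ec1".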
        for sha1 in list_get_sha1s:
            dict_push[str(sha1)] = dict()
            tmp_dict = dict()
            try:
                tmp_dict = self.dict_sha1_infos[str(sha1)]
            except KeyError:
                # This would mean the image has been marked as part of another batch by another
                # process, and thus deleted in a previous 'get_unprocessed_rows' call.
                # This is only relevant if we run in daemon mode...
                # TODO: for transition we won't really have any info to push except the update_id...
                if daemon:
                    del dict_push[str(sha1)]
                    continue
            # build column names properly, i.e. prepending the image info column family
            for key in tmp_dict:
                # changed to: use column_family from indexer
                # But the use of 'key' here also means we rely on the input to define column name...
                #dict_push[str(sha1)]['info:' + key] = tmp_dict[key]
                dict_push[str(sha1)][self.indexer.imginfocf + ':' +
                                     key] = tmp_dict[key]
            dict_push[str(sha1)][self.batch_check_column] = update_id
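        # Illustrative example (hypothetical values): with an image-info column family "info"
        # and a single sha1 "ABCD...", 'dict_push' could look like
        #   {"ABCD...": {"info:s3_url": "http://...",
        #                "<extr_cf>:<extr_prefix>_updateid": update_id}}
        # where the last column is 'self.batch_check_column'; "s3_url" is just an example field.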
        return dict_push, update_id

    def get_unprocessed_rows(self, list_check_sha1s):
        """Get the subset of the list of sha1s ``list_check_sha1s`` that have not been processed yet

        :param list_check_sha1s: list of images sha1 to check
        :type list_check_sha1s: list
        :return: set of unprocessed images
        :rtype: set
        """
        # TODO: also pass current update_id and only delete if != from current update...

        unprocessed_rows = set(list_check_sha1s)

        if list_check_sha1s:
            # Check if the selected sha1 rows in the HBase table 'sha1infos' have the check columns.
            # This call will only return rows that DO have those check columns.
            fam = self.indexer.get_dictcf_sha1_table()
            try:
                sha1s_rows = self.indexer.get_columns_from_sha1_rows(
                    list_check_sha1s, self.check_columns, families=fam)
            except Exception as inst:
                print("[{}.get_unprocessed_rows: ERROR] families: {}".format(
                    self.pp, fam))
                raise inst
            if sha1s_rows:
                # TODO: only delete if really previously processed, i.e. if != from current update...
                found_sha1_rows = set([str(row[0]) for row in sha1s_rows])
                # Clean up 'dict_sha1_infos' deleting found_sha1_rows
                self.cleanup_dict_infos(found_sha1_rows)
                set_list_check_sha1s = set(list_check_sha1s)
                # TODO: but we should not re-add them, so we should discard them from unprocessed_rows
                unprocessed_rows = set_list_check_sha1s - found_sha1_rows
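                # Illustrative example (hypothetical sha1s): if 'list_check_sha1s' is
                # ["A", "B", "C"] and HBase already has the check columns for "B", then
                # 'found_sha1_rows' is {"B"}, "B" is removed from 'dict_sha1_infos' and
                # 'unprocessed_rows' becomes {"A", "C"}.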

        return unprocessed_rows

    def run(self, daemon=False):
        """Run extraction checker

        :param daemon: whether we are running in daemon mode
        :type daemon: bool
        :raises Exception: if check fails
        """
        # import inspect
        # if not inspect.isgeneratorfunction(self.ingester.get_msg_json()):
        #   msg = "[{}: Warning] Ingester {} function `get_msg_json` is not a generator"
        #   print(msg.format(self.pp, type(self.ingester)))

        try:
            list_sha1s_to_process = []
            list_check_sha1s = []
            # Initialize 'msg' so the error handler in the loop below can safely reference it
            msg = None
            # TODO: create update_id here

            if self.verbose > 1:
                msg = "[{}: log] Start run main loop"
                print(msg.format(self.pp))

            while True:

                try:
                    # Accumulate images infos
                    #while len(list_check_sha1s) < self.indexer.batch_update_size:
                    #while len(list_check_sha1s) < self.min_len_check:

                    for msg in self.ingester.get_msg_json():
                        try:
                            # Fix if input was JSON dumped twice?
                            if not isinstance(msg, dict):
                                msg = json.loads(msg)
                            # msg could now contain keys 'sha1' or 'list_sha1s'
                            if 'sha1' in msg:
                                list_check_sha1s.append(
                                    str(msg['sha1']).upper())
                                # Store other fields to be able to push them too
                                self.store_img_infos(msg)
                            elif 'list_sha1s' in msg:
                                for sha1 in msg['list_sha1s']:
                                    list_check_sha1s.append(str(sha1).upper())
                                    # We won't have any additional infos in that case,
                                    # but we should still build a dict for each sample for consistency...
                                    tmp_dict = dict()
                                    tmp_dict['sha1'] = str(sha1).upper()
                                    # will basically push a dict with just the sha1 to self.dict_sha1_infos, so self.get_dict_push
                                    # works properly later on...
                                    self.store_img_infos(tmp_dict)
                            else:
                                raise ValueError(
                                    'Unknown keys in msg: {}'.format(
                                        msg.keys()))
                            # This is dangerous, as it assumes the self.ingester.get_msg_json() generator
                            # would restart from the next point... Is this the case for Kafka?
                            prev_len = len(list_check_sha1s)
                            list_check_sha1s = list(set(list_check_sha1s))
                            if len(list_check_sha1s) < prev_len:
                                msg = "[{}: log] Removed {} duplicate from `list_check_sha1s`"
                                print(
                                    msg.format(
                                        self.pp,
                                        prev_len - len(list_check_sha1s)))
                            if len(list_check_sha1s
                                   ) >= self.indexer.batch_update_size:
                                break
                        except Exception as inst:
                            pr_msg = "[{}: ERROR] Could not process message: {}. {}"
                            print(pr_msg.format(self.pp, msg, inst))
                except Exception as inst:
                    pr_msg = "[{}: at {} ERROR] Caught {} {} in consumer loop"
                    now_str = datetime.now().strftime('%Y-%m-%d:%H.%M.%S')
                    print(pr_msg.format(self.pp, now_str, type(inst), inst))
                    if msg is not None:
                        print(msg)
                    sys.stdout.flush()

                if self.verbose > 3:
                    msg = "[{}: log] Gathered {} images to check so far"
                    msg = msg.format(self.pp, len(list_check_sha1s))
                    msg2 = ""
                    if len(list_check_sha1s) > 0:
                        msg2 = " (first: {}, last: {})"
                        msg2 = msg2.format(list_check_sha1s[0],
                                           list_check_sha1s[-1])
                    print(msg + msg2)

                # To be able to push one (non empty) update every max_delay
                #if not list_check_sha1s and (time.time() - self.last_push) < self.max_delay:
                if len(list_check_sha1s) < self.indexer.batch_update_size and (
                        time.time() - self.last_push) < self.max_delay:
                    time.sleep(1)
                    continue

                self.nb_imgs_check += len(list_check_sha1s)
                push_delay = (time.time() - self.last_push) > max(
                    int(self.max_delay / 60), 10)
                if push_delay and self.nb_imgs_unproc_lastprint != self.nb_imgs_unproc:
                    msg = "[{}: log] Pushed {} unprocessed images so far"
                    print(
                        msg.format(self.pp, self.nb_imgs_unproc,
                                   self.nb_imgs_check))
                    self.nb_imgs_unproc_lastprint = self.nb_imgs_unproc

                if list_check_sha1s:
                    # Check which images have not been processed (or pushed in an update) yet
                    # This seems slow
                    start_check = time.time()
                    unprocessed_rows = self.get_unprocessed_rows(
                        list_check_sha1s)
                    msg = "[{}: log] Found {}/{} unprocessed images in {:.2f}s"
                    print(
                        msg.format(self.pp, len(unprocessed_rows),
                                   len(list_check_sha1s),
                                   time.time() - start_check))
                    if len(unprocessed_rows) != len(
                            list_check_sha1s) and self.verbose > 5:
                        already_processed = list(
                            set(list_check_sha1s) - set(unprocessed_rows))
                        msg = "[{}: log] Images ".format(self.pp)
                        for ap in already_processed:
                            msg += "{} ".format(ap)
                        msg += "were already processed."
                        print(msg)

                    #unprocessed_rows = self.get_unprocessed_rows(list_check_sha1s)

                    # TODO: we should mark those images as being 'owned' by the update we are constructing
                    # (only important if we are running multiple threads i.e. daemon is True),
                    # otherwise another update running at the same time could also claim them.
                    # Duplicates could be handled when adding data to the searcher, but that would
                    # mean duplicated work in the extraction process...

                    # Push sha1s to be processed
                    for sha1 in unprocessed_rows:
                        list_sha1s_to_process.append(sha1)

                    # Remove potential duplicates
                    list_sha1s_to_process = list(set(list_sha1s_to_process))
                    list_check_sha1s = []

                if list_sha1s_to_process:
                    # Push them to HBase by batch of 'batch_update_size'
                    push_delay = (time.time() -
                                  self.last_push) > self.max_delay
                    full_batch = (len(list_sha1s_to_process)
                                  >= self.indexer.batch_update_size)
                    if full_batch or (push_delay and list_sha1s_to_process):
                        # Trim here to push exactly a batch of 'batch_update_size'
                        # (slicing clamps to the list length)
                        list_push = list_sha1s_to_process[
                            :self.indexer.batch_update_size]

                        # TODO: this should be done before,
                        # to 'claim' the images as soon as we plan to process them for this update
                        # Gather corresponding sha1 infos
                        dict_push, update_id = self.get_dict_push(
                            list_push, daemon=daemon)
                        if dict_push:
                            self.nb_imgs_unproc += len(dict_push.keys())
                            msg = "[{}: at {}] Pushing update {} of {} images."
                            now_str = datetime.now().strftime(
                                '%Y-%m-%d:%H.%M.%S')
                            print(
                                msg.format(self.pp, now_str, update_id,
                                           len(dict_push.keys())))
                            sys.stdout.flush()

                            # Push images
                            fam = self.indexer.get_dictcf_sha1_table()
                            if self.verbose > 5:
                                msg = "[{}] Pushing images for update {} with fam {}"
                                print(msg.format(self.pp, update_id, fam))
                            sha1s_table = self.indexer.table_sha1infos_name
                            self.indexer.push_dict_rows(dict_push,
                                                        sha1s_table,
                                                        families=fam)

                            # Build HBase updates dict
                            dict_updates_db = dict()
                            now_str = datetime.now().strftime(
                                '%Y-%m-%d:%H.%M.%S')
                            list_sha1s_col = self.indexer.get_col_listsha1s()
                            dict_updates_db[update_id] = {
                                list_sha1s_col: ','.join(dict_push.keys()),
                                self.indexer.get_col_upcreate(): now_str
                            }
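                            # Illustrative row content (hypothetical qualifiers):
                            #   {update_id: {"info:list_sha1s": "SHA1A,SHA1B,...",
                            #                "info:created": "2018-01-01:00.00.00"}}
                            # The actual column names come from get_col_listsha1s()
                            # and get_col_upcreate(); "info:..." is only an assumption.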
                            # Push it
                            fam = self.indexer.get_dictcf_update_table()
                            if self.verbose > 5:
                                msg = "[{}] Pushing update {} info with fam {}"
                                print(msg.format(self.pp, update_id, fam))
                            self.indexer.push_dict_rows(
                                dict_updates_db,
                                self.indexer.table_updateinfos_name,
                                families=fam)

                            # Build pusher updates dict
                            if self.pusher is not None:
                                dict_updates_kafka = dict()
                                dict_updates_kafka[update_id] = ','.join(
                                    dict_push.keys())
                                # Push it
                                #self.ingester.producer.send(self.updates_out_topic, json.dumps(dict_updates_kafka))
                                #self.pusher.send(self.updates_out_topic, dict_updates_kafka)
                                self.pusher.send(dict_updates_kafka)

                            # Gather any remaining sha1s and clean up infos
                            if len(list_sha1s_to_process
                                   ) > self.indexer.batch_update_size:
                                list_sha1s_to_process = list_sha1s_to_process[
                                    self.indexer.batch_update_size:]
                            else:
                                list_sha1s_to_process = []
                            # if duplicates wrt list_push, remove them. Can this still happen?
                            list_sha1s_to_process = [
                                sh1 for sh1 in list_sha1s_to_process
                                if sh1 not in list_push
                            ]
                            self.cleanup_dict_infos(list_push)
                        else:
                            msg = "[{}: at {}] Nothing to push for update {}"
                            print(
                                msg.format(
                                    self.pp,
                                    datetime.now().strftime(
                                        '%Y-%m-%d:%H.%M.%S'), update_id))
                            sys.stdout.flush()
                        self.last_push = time.time()
                        # TODO: we should create a new update_id here,
                        # and let it claim the potential remaining images in 'list_sha1s_to_process'
                        # sanity check that len(list_sha1s_to_process) == len(self.dict_sha1_infos) ?

                    else:
                        if self.verbose > 3:
                            msg = "[{}: at {}] Gathered {} images so far..."
                            now_str = datetime.now().strftime(
                                '%Y-%m-%d:%H.%M.%S')
                            print(
                                msg.format(self.pp, now_str,
                                           len(list_sha1s_to_process)))

        except Exception as inst:
            exc_type, exc_obj, exc_tb = sys.exc_info()
            fulltb = traceback.format_tb(exc_tb)
            raise type(inst)(" {} ({})".format(inst, ''.join(fulltb)))