Code example #1
  def __init__(self, global_conf, prefix=default_extr_check_prefix, pid=None):
    self.list_extr_prefix = []
    self.pid = pid
    self.dict_sha1_infos = dict()

    super(ExtractionChecker, self).__init__(global_conf, prefix)

    self.featurizer_type = self.get_required_param("featurizer_type")
    self.detector_type = self.get_required_param("detector_type")
    self.input_type = self.get_required_param("input_type")

    # Needs to be built from extraction type and detection input + "_processed"
    self.extr_family_column = "ext"
    tmp_extr_family_column = self.get_param("extr_family_column")
    if tmp_extr_family_column:
      self.extr_family_column = tmp_extr_family_column

    # Max delay
    self.max_delay = 3600
    # self.max_delay = 600
    max_delay = self.get_param("max_delay")
    if max_delay:
      self.max_delay = int(max_delay)
    self.last_push = time.time()
    self.nb_imgs_check = 0
    self.nb_imgs_unproc = 0
    self.nb_imgs_unproc_lastprint = 0

    # Beware, the self.extr_family_column should be added to the indexer families parameter in get_create_table...
    # TODO: should we add the 'ad' column family too here by default
    self.tablesha1_col_families = {'info': dict(), self.extr_family_column: dict()}
    self.list_extr_prefix = [self.featurizer_type, "feat", self.detector_type, self.input_type]
    self.extr_prefix = "_".join(self.list_extr_prefix)
    self.extr_prefix_base_column_name = self.extr_family_column + ":" + self.extr_prefix
    self.extr_check_column = self.extr_prefix_base_column_name + "_processed"
    # Needs to be built from extraction type and extraction input + "_batchid"
    self.batch_check_column = self.extr_prefix_base_column_name + "_updateid"
    self.check_columns = [self.extr_check_column, self.batch_check_column]

    self.set_pp()

    # Initialize indexer and ingester
    self.indexer = HBaseIndexerMinimal(self.global_conf,
                                       prefix=self.get_required_param("indexer_prefix"))
    self.ingester = GenericKafkaProcessor(self.global_conf,
                                          prefix=self.get_required_param("check_ingester_prefix"))
    # This will not be set for HBase processing, and the checker would keep dying here; sleep before re-raising...
    try:
      self.updates_out_topic = self.ingester.get_required_param("producer_updates_out_topic")
    except Exception as inst:
      print("Could not initialize checker, sleeping for {}s.".format(self.max_delay))
      time.sleep(self.max_delay)
      raise inst
    self.ingester.pp = "ec"
    if self.pid:
      self.ingester.pp += str(self.pid)
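
For reference, here is a minimal sketch of how such a checker could be instantiated. It assumes that ConfReader resolves a parameter by concatenating the prefix and the parameter name (as the "HBI_"-prefixed keys in code example #4 below suggest); the "EXTRCHECK_" and "KIN_" prefixes, every value, and the omitted HBase/Kafka sub-settings are hypothetical placeholders, not taken from the original project configuration.

# Hypothetical configuration; keys assume prefix + parameter name lookup.
check_conf = {
    "EXTRCHECK_featurizer_type": "sbpycaffe",   # placeholder featurizer type
    "EXTRCHECK_detector_type": "full",          # placeholder detector type
    "EXTRCHECK_input_type": "image",            # placeholder input type
    "EXTRCHECK_indexer_prefix": "HBI_",         # prefix of the HBase indexer settings
    "EXTRCHECK_check_ingester_prefix": "KIN_",  # prefix of the Kafka ingester settings
    # "HBI_..." and "KIN_..." entries (hosts, tables, topics, brokers) would go here.
}

# Assumes ExtractionChecker and its dependencies are importable from the project.
checker = ExtractionChecker(check_conf, prefix="EXTRCHECK_")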
Code example #2
  def __init__(self, global_conf, prefix=default_extr_proc_prefix):
    self.extractor = None
    self.nb_empt = 0
    self.nb_err = 0
    self.max_proc_time = 1200 # in seconds. Increased for sbcmdline...
    self.url_input = True

    super(ExtractionProcessor, self).__init__(global_conf, prefix)

    self.input_type = self.get_required_param("input_type")
    self.nb_threads = self.get_required_param("nb_threads")
    self.featurizer_type = self.get_required_param("featurizer_type")
    self.featurizer_prefix = self.get_required_param("featurizer_prefix")
    self.detector_type = self.get_required_param("detector_type")

    # Means we extract features from the whole image
    if self.detector_type == "full":
      self.detector = None

    self.verbose = 0
    verbose = self.get_param("verbose")
    if verbose:
      self.verbose = int(verbose)

    self.ingestion_input = "kafka"
    ingestion_input = self.get_param("ingestion_input")
    if ingestion_input:
      self.ingestion_input = ingestion_input

    file_input = self.get_param("file_input")
    print("[{}.ExtractionProcessor: log] file_input: {}".format(self.pp, file_input))
    if file_input:
      self.url_input = False
    print("[{}.ExtractionProcessor: log] url_input: {}".format(self.pp, self.url_input))

    if self.url_input:
      self.img_column = img_URL_column
    else:
      self.img_column = img_path_column
    print("[{}.ExtractionProcessor: log] img_column: {}".format(self.pp, self.img_column))

    # Needs to be built from extraction type and detection input + "_processed"
    self.extr_family_column = "ext"
    tmp_extr_family_column = self.get_param("extr_family_column")
    if tmp_extr_family_column:
      self.extr_family_column = tmp_extr_family_column

    self.push_back = False
    push_back = self.get_param("push_back")
    if push_back:
      self.push_back = True

    self.extr_prefix = build_extr_str(self.featurizer_type, self.detector_type, self.input_type)
    self.set_pp()

    # Initialize queues
    self.init_queues()

    # Initialize extractors only once (just one first)
    self.extractors = []
    #for i in range(self.nb_threads):
    #  self.extractors.append(GenericExtractor(self.detector_type, self.featurizer_type, self.input_type,
    #                                  self.extr_family_column, self.featurizer_prefix, self.global_conf))
    self.extractors.append(GenericExtractor(self.detector_type, self.featurizer_type,
                                            self.input_type, self.extr_family_column,
                                            self.featurizer_prefix, self.global_conf))

    # Beware, the self.extr_family_column should be added to the indexer families parameter in get_create_table...
    # What if the table has some other column families?...
    self.tablesha1_col_families = {'info': dict(), self.extr_family_column: dict()}

    # Initialize indexer
    self.indexer = HBaseIndexerMinimal(self.global_conf, prefix=self.get_required_param("indexer_prefix"))
    self.last_update_date_id = ''

    # Initialize ingester
    self.ingester = GenericKafkaProcessor(self.global_conf, prefix=self.get_required_param("proc_ingester_prefix"))
    self.ingester.pp = "ep"
Code example #3
class ExtractionProcessor(ConfReader):

  def __init__(self, global_conf, prefix=default_extr_proc_prefix):
    self.extractor = None
    self.nb_empt = 0
    self.nb_err = 0
    self.max_proc_time = 1200 # in seconds. Increased for sbcmdline...
    self.url_input = True

    super(ExtractionProcessor, self).__init__(global_conf, prefix)

    self.input_type = self.get_required_param("input_type")
    self.nb_threads = self.get_required_param("nb_threads")
    self.featurizer_type = self.get_required_param("featurizer_type")
    self.featurizer_prefix = self.get_required_param("featurizer_prefix")
    self.detector_type = self.get_required_param("detector_type")

    # Means we extract features from the whole image
    if self.detector_type == "full":
      self.detector = None

    self.verbose = 0
    verbose = self.get_param("verbose")
    if verbose:
      self.verbose = int(verbose)

    self.ingestion_input = "kafka"
    ingestion_input = self.get_param("ingestion_input")
    if ingestion_input:
      self.ingestion_input = ingestion_input

    file_input = self.get_param("file_input")
    print("[{}.ExtractionProcessor: log] file_input: {}".format(self.pp, file_input))
    if file_input:
      self.url_input = False
    print("[{}.ExtractionProcessor: log] url_input: {}".format(self.pp, self.url_input))

    if self.url_input:
      self.img_column = img_URL_column
    else:
      self.img_column = img_path_column
    print("[{}.ExtractionProcessor: log] img_column: {}".format(self.pp, self.img_column))

    # Needs to be built from extraction type and detection input + "_processed"
    self.extr_family_column = "ext"
    tmp_extr_family_column = self.get_param("extr_family_column")
    if tmp_extr_family_column:
      self.extr_family_column = tmp_extr_family_column

    self.push_back = False
    push_back = self.get_param("push_back")
    if push_back:
      self.push_back = True

    self.extr_prefix = build_extr_str(self.featurizer_type, self.detector_type, self.input_type)
    self.set_pp()

    # Initialize queues
    self.init_queues()

    # Initialize extractors only once (just one first)
    self.extractors = []
    #for i in range(self.nb_threads):
    #  self.extractors.append(GenericExtractor(self.detector_type, self.featurizer_type, self.input_type,
    #                                  self.extr_family_column, self.featurizer_prefix, self.global_conf))
    self.extractors.append(GenericExtractor(self.detector_type, self.featurizer_type,
                                            self.input_type, self.extr_family_column,
                                            self.featurizer_prefix, self.global_conf))

    # Beware, the self.extr_family_column should be added to the indexer families parameter in get_create_table...
    # What if the table has some other column families?...
    self.tablesha1_col_families = {'info': dict(), self.extr_family_column: dict()}

    # Initialize indexer
    self.indexer = HBaseIndexerMinimal(self.global_conf, prefix=self.get_required_param("indexer_prefix"))
    self.last_update_date_id = ''

    # Initialize ingester
    self.ingester = GenericKafkaProcessor(self.global_conf, prefix=self.get_required_param("proc_ingester_prefix"))
    self.ingester.pp = "ep"


  def set_pp(self):
    self.pp = "ExtractionProcessor"
    if self.extractor:
      self.pp += "_"+self.extr_prefix

  def init_queues(self):
    from multiprocessing import JoinableQueue
    self.q_in = []
    self.q_out = []
    for i in range(self.nb_threads):
      self.q_in.append(JoinableQueue(0))
      self.q_out.append(JoinableQueue(0))

  def get_batch_hbase(self):
    # legacy implementation: better to have a kafka topic for batches to be processed to allow
    #       safe parallelization on different machines
    try:
      for updates in self.indexer.get_unprocessed_updates_from_date(self.last_update_date_id,
                                                                    extr_type=self.extr_prefix):
        for update_id, update_cols in updates:
          if self.extr_prefix in update_id:
            # double check update has not been processed somewhere else
            if self.is_update_unprocessed(update_id):
              # double check update was not marked as started recently i.e. by another process
              if self.is_update_notstarted(update_id, max_delay=TIME_ELAPSED_FAILED):
                list_sha1s = update_cols[column_list_sha1s].split(',')
                log_msg = "[{}.get_batch_hbase: log] Update {} has {} images."
                print(log_msg.format(self.pp, update_id, len(list_sha1s)))
                # also get 'ext:' to check if extraction was already processed?
                rows_batch = self.indexer.get_columns_from_sha1_rows(list_sha1s,
                                                                     columns=[img_buffer_column,
                                                                              self.img_column])
                # print "rows_batch", rows_batch
                if rows_batch:
                  yield rows_batch, update_id
                  self.last_update_date_id = '_'.join(update_id.split('_')[-2:])
                else:
                  log_msg = "[{}.get_batch_hbase: log] Did not get any image buffer for update: {}"
                  print(log_msg.format(self.pp, update_id))
              else:
                log_msg = "[{}.get_batch_hbase: log] Skipping update started recently: {}"
                print(log_msg.format(self.pp, update_id))
                continue
            else:
              log_msg = "[{}.get_batch_hbase: log] Skipping already processed update: {}"
              print(log_msg.format(self.pp, update_id))
              continue
          else:
            if self.verbose > 6:
              log_msg = "[{}.get_batch_hbase: log] Skipping update {} from another extraction type."
              print(log_msg.format(self.pp, update_id))
      else:
        print("[{}.get_batch_hbase: log] No unprocessed update found.".format(self.pp))
        # Look for updates that have some unprocessed images
        # TODO: whether we do that or not could be specified by a parameter
        # as this induces slow down during update...
        for updates in self.indexer.get_missing_extr_updates_from_date("1970-01-01",
                                                                       extr_type=self.extr_prefix):
          for update_id, update_cols in updates:
            if self.extr_prefix in update_id:
              if column_list_sha1s in update_cols:
                list_sha1s = update_cols[column_list_sha1s].split(',')
                log_msg = "[{}.get_batch_hbase: log] Update {} has {} images missing extractions."
                print(log_msg.format(self.pp, update_id, len(list_sha1s)))
                sys.stdout.flush()
                # also get 'ext:' to check if extraction was already processed?
                rows_batch = self.indexer.get_columns_from_sha1_rows(list_sha1s,
                                                                     columns=[img_buffer_column,
                                                                              self.img_column])
                if rows_batch:
                  yield rows_batch, update_id
                else:
                  log_msg = "[{}.get_batch_hbase: log] Did not get any image buffer for update: {}"
                  print(log_msg.format(self.pp, update_id))
              else:
                log_msg = "[{}.get_batch_hbase: log] Update {} has no images list."
                print(log_msg.format(self.pp, update_id))
            else:
              log_msg = "[{}.get_batch_hbase: log] Skipping update {} from another extraction type."
              print(log_msg.format(self.pp, update_id))
        else:
          log_msg = "[{}.get_batch_hbase: log] No updates with missing extractions found."
          print(log_msg.format(self.pp))
          sys.stdout.flush()

    except Exception as inst:
      full_trace_error("[{}.get_batch_hbase: error] {}".format(self.pp, inst))


  def is_update_unprocessed(self, update_id):
    update_rows = self.indexer.get_rows_by_batch([update_id],
                                                 table_name=self.indexer.table_updateinfos_name)
    if update_rows:
      for row in update_rows:
        if info_column_family+":"+update_str_processed in row[1]:
          return False
    return True

  def is_update_notstarted(self, update_id, max_delay=None):
    """Check if an update was not started yet.

    :param update_id: update id
    :param max_delay: delay (in seconds) between marked start time and now to consider update failed
    :return: boolean
    """
    update_rows = self.indexer.get_rows_by_batch([update_id],
                                                 table_name=self.indexer.table_updateinfos_name)
    if update_rows:
      for row in update_rows:
        if info_column_family+":"+update_str_started in row[1]:
          if max_delay:
            # check that 'started' was marked recently; if not, the update processing may have failed
            start_str = row[1][info_column_family+":"+update_str_started]
            # start time format is '%Y-%m-%d:%H.%M.%S'
            start_dt = datetime.strptime(start_str, '%Y-%m-%d:%H.%M.%S')
            now_dt = datetime.now()
            diff_dt = now_dt - start_dt
            if diff_dt.total_seconds() > max_delay:
              return True
            else:
              return False
          else:
            return False
    return True

  def get_batch_kafka(self):
    # Read from a kafka topic to allow safer parallelization on different machines
    try:
      # Needs to read topic to get update_id and list of sha1s
      for msg in self.ingester.consumer:
        msg_dict = json.loads(msg.value)
        update_id = msg_dict.keys()[0]
        # NB: Try to get update info and check it was really not processed yet.
        if self.is_update_unprocessed(update_id):
          str_list_sha1s = msg_dict[update_id]
          list_sha1s = str_list_sha1s.split(',')
          print("[{}.get_batch_kafka: log] Update {} has {} images.".format(self.pp, update_id, len(list_sha1s)))
          # NB: we could also get 'ext:' of images to double check if extraction was already processed
          #rows_batch = self.indexer.get_columns_from_sha1_rows(list_sha1s, columns=["info:img_buffer"])
          if self.verbose > 3:
            print("[{}.get_batch_kafka: log] Looking for columns: {}".format(self.pp, [img_buffer_column, self.img_column]))
          rows_batch = self.indexer.get_columns_from_sha1_rows(list_sha1s, columns=[img_buffer_column, self.img_column])
          #print "rows_batch", rows_batch
          if rows_batch:
            if self.verbose > 4:
              print("[{}.get_batch_kafka: log] Yielding for update: {}".format(self.pp, update_id))
            yield rows_batch, update_id
            self.ingester.consumer.commit()
            if self.verbose > 4:
              print("[{}.get_batch_kafka: log] After yielding for update: {}".format(self.pp, update_id))
            self.last_update_date_id = '_'.join(update_id.split('_')[-2:])
          # Should we try to commit offset only at this point?
          else:
            print("[{}.get_batch_kafka: log] Did not get any image buffers for the update: {}".format(self.pp, update_id))
        else:
          print("[{}.get_batch_kafka: log] Skipping already processed update: {}".format(self.pp, update_id))
      else:
        print("[{}.get_batch_kafka: log] No update found.".format(self.pp))
        # Fall back to checking HBase for unstarted/unfinished updates
        for rows_batch, update_id in self.get_batch_hbase():
          yield rows_batch, update_id
    except Exception as inst:
      full_trace_error("[{}.get_batch_kafka: error] {}".format(self.pp, inst))


  def get_batch(self):
    if self.ingestion_input == "hbase":
      for rows_batch, update_id in self.get_batch_hbase():
        yield rows_batch, update_id
    else:
      for rows_batch, update_id in self.get_batch_kafka():
        yield rows_batch, update_id

  def process_batch(self):
    # Get a new update batch
    for rows_batch, update_id in self.get_batch():
      try:
        start_update = time.time()
        print("[{}] Processing update {} of {} rows.".format(self.pp, update_id, len(rows_batch)))
        sys.stdout.flush()

        # Initialize
        self.nb_empt = 0
        self.init_queues()
        threads = []

        # If we deleted an extractor at some point or for first batch
        nb_extr_to_create = self.nb_threads - len(self.extractors)
        if nb_extr_to_create:
          start_create_extractor = time.time()
          while len(self.extractors) < self.nb_threads:

            self.extractors.append(GenericExtractor(self.detector_type, self.featurizer_type, self.input_type,
                                                    self.extr_family_column, self.featurizer_prefix,
                                                    self.global_conf))
          buff_msg = "[{}] Created {} extractors in {}s."
          create_extr_time = time.time() - start_create_extractor
          print(buff_msg.format(self.pp, nb_extr_to_create, create_extr_time))


        # Mark batch as started to be processed
        now_str = datetime.now().strftime('%Y-%m-%d:%H.%M.%S')
        dict_val = {info_column_family + ':' + update_str_started: now_str}
        update_started_dict = {update_id: dict_val}
        self.indexer.push_dict_rows(dict_rows=update_started_dict,
                                    table_name=self.indexer.table_updateinfos_name)

        # Push images to queue
        list_in = []
        # For parallelized downloading...
        from Queue import Queue
        nb_imgs_dl = 0
        q_in_dl = Queue(0)
        q_out_dl = Queue(0)

        start_get_buffer = time.time()
        for img in rows_batch:
          # should decode base64
          if img_buffer_column in img[1]:
            tup = (img[0], img[1][img_buffer_column], False)
            list_in.append(tup)
          else:
            # need to re-download, accumulate a list of URLs to download
            # Deal with img_path_column for local_images_kafka_pusher
            if self.img_column in img[1]:
              q_in_dl.put((img[0], img[1][self.img_column], self.push_back))
              nb_imgs_dl += 1
            else:
              print("[{}: warning] No buffer and no URL/path for image {} !".format(self.pp, img[0]))
              continue

        # Download missing images
        if nb_imgs_dl > 0:
          threads_dl = []
          for i in range(min(self.nb_threads, nb_imgs_dl)):
            # should read (url, obj_pos) from self.q_in
            # and push (url, obj_pos, buffer, img_info, start_process, end_process) to self.q_out
            thread = ThreadedDownloaderBufferOnly(q_in_dl, q_out_dl, url_input=self.url_input)
            thread.start()
            threads_dl.append(thread)

          q_in_dl.join()

          # Push downloaded images to list_in too
          nb_dl = 0
          while nb_dl < nb_imgs_dl:
            # This can block?
            #sha1, buffer, push_back, inst = q_out_dl.get()
            try:
              sha1, buffer, push_back, inst = q_out_dl.get(True, 10)
            except Exception as queue_err:
              warn_msg = "[{}: error] Download output queue timed out: {}"
              print(warn_msg.format(self.pp, queue_err))
              break
            nb_dl += 1
            if inst:
              if self.verbose > 0:
                log_msg = "[{}: log] Could not download image {}, error was: {}"
                print(log_msg.format(self.pp, sha1, inst))
            else:
              if buffer:
                list_in.append((sha1, buffer, push_back))
              else:
                # Is that possible?
                err_msg = "[{}: error] No error but no buffer either for image {}"
                print(err_msg.format(self.pp, sha1))

        get_buffer_time = time.time() - start_get_buffer
        buff_msg = "[{}] Got {}/{} image buffers for update {} in {}s."
        print(buff_msg.format(self.pp, len(list_in), len(rows_batch), update_id, get_buffer_time))
        sys.stdout.flush()

        q_batch_size = int(math.ceil(float(len(list_in))/self.nb_threads))
        for i, q_batch in enumerate(build_batch(list_in, q_batch_size)):
          self.q_in[i].put(q_batch)

        q_in_size = []
        q_in_size_tot = 0
        for i in range(self.nb_threads):
          q_in_size.append(self.q_in[i].qsize())
          q_in_size_tot += q_in_size[i]
        if self.verbose > 5:
          print("[{}] Total input queues sizes is: {}".format(self.pp, q_in_size_tot))

        # Start daemons...
        thread_creation_failed = [0]*self.nb_threads
        for i in range(self.nb_threads):
          # one per non empty input queue
          if q_in_size[i] > 0:
            try:
              thread = DaemonBatchExtractor(self.extractors[i], self.q_in[i], self.q_out[i], verbose=self.verbose)
              # Could get a 'Cannot allocate memory' if we are using too many threads...
              thread.start()
              threads.append(thread)
            except OSError as inst:
              # Should we try to push self.q_in[i] data to some other thread?
              print("[{}.process_batch: error] Could not start thread #{}: {}".format(self.pp, i+1, inst))
              thread_creation_failed[i] = 1
              time.sleep(10*sum(thread_creation_failed))

        if sum(thread_creation_failed) == self.nb_threads:
          raise ValueError("Could not start any thread...")

        nb_threads_running = len(threads)
        start_process = time.time()
        stop = time.time() + self.max_proc_time
        # Wait for all tasks to be marked as done
        threads_finished = [0] * nb_threads_running
        deleted_extr = [0] * nb_threads_running
        thread_msg = "[{}] Thread {}/{} (pid: {}) "
        while sum(threads_finished) < nb_threads_running:
          for i in range(nb_threads_running):
            if sum(threads_finished) == nb_threads_running:
              sys.stdout.flush()
              break
            if threads_finished[i] == 1:
              continue
            i_q_in = i + sum(thread_creation_failed[:i + 1])
            if q_in_size[i_q_in] > 0:
              # This seems to block forever sometimes, if subprocess crashed?...
              #self.q_in[i].join()
              # Manual join with timeout...
              # https://github.com/python/cpython/blob/3.6/Lib/multiprocessing/queues.py
              if not self.q_in[i_q_in]._unfinished_tasks._semlock._is_zero() and time.time() < stop:
                time.sleep(1)
              else:
                if self.q_in[i_q_in]._unfinished_tasks._semlock._is_zero():
                  if self.verbose > 5:
                    msg = thread_msg+"marked as finished because processing seems finished"
                    print(msg.format(self.pp, i+1, nb_threads_running, threads[i].pid))
                else:
                  if self.verbose > 0:
                    # In which cases does this happen?...
                    msg = thread_msg+"force marked task as done as max_proc_time ({}) has passed."
                    print(msg.format(self.pp, i+1, nb_threads_running, threads[i].pid,
                                     self.max_proc_time))
                    sys.stdout.flush()
                    # Try to delete corresponding extractor to free memory?
                    # And reduce number of threads at the end of the loop
                  try:
                    self.q_in[i_q_in].task_done()
                    if deleted_extr[i] == 0:
                      # we pushed the extractor as self.extractors[i] in a loop of self.nb_threads
                      # we use i_q_in
                      del self.extractors[i_q_in]
                      deleted_extr[i] = 1
                  except Exception:
                    pass
                threads_finished[i] = 1
            else:
              if self.verbose > 2:
                # We actually never gave it anything to process...
                msg = thread_msg+"marked as finished because no data was passed to it"
                print(msg.format(self.pp, i+1, nb_threads_running, threads[i].pid))
              threads_finished[i] = 1

        # Cleanup threads to free memory before getting data back
        # Daemon may still be running...
        # and will actually be deleted only when they exit after not getting a batch
        del threads

        # Gather results
        q_out_size = []
        q_out_size_tot = 0
        for i in range(self.nb_threads):
          q_out_size.append(self.q_out[i].qsize())
          q_out_size_tot += q_out_size[i]

        if self.verbose > 5:
          print("[{}: log] Total output queues size is: {}".format(self.pp, q_out_size_tot))
          sys.stdout.flush()

        # Can get stuck here?
        dict_imgs = dict()
        for i in range(self.nb_threads):
          if self.verbose > 4:
            print("[{}] Thread {} q_out_size: {}".format(self.pp, i+1, q_out_size[i]))
            sys.stdout.flush()
          while q_out_size[i]>0 and not self.q_out[i].empty():
            if self.verbose > 6:
              print("[{}] Thread {} q_out is not empty.".format(self.pp, i + 1))
              sys.stdout.flush()
            try:
              batch_out = self.q_out[i].get(True, 10)
              if self.verbose > 4:
                msg = "[{}] Got batch of {} features from thread {} q_out."
                print(msg.format(self.pp, len(batch_out), i + 1))
                sys.stdout.flush()
              for sha1, dict_out in batch_out:
                dict_imgs[sha1] = dict_out
            except Exception as inst:
              if self.verbose > 1:
                print("[{}] Thread {} failed to get from q_out: {}".format(self.pp, i+1, inst))
                sys.stdout.flush()
            if self.verbose > 4:
              print("[{}] Marking task done in q_out of thread {}.".format(self.pp, i + 1))
              sys.stdout.flush()
            self.q_out[i].task_done()

        #if self.verbose > 0:
        print_msg = "[{}] Got features for {}/{} images in {}s."
        proc_time = time.time() - start_process
        print(print_msg.format(self.pp, len(dict_imgs.keys()), len(list_in), proc_time))
        sys.stdout.flush()

        # Push them
        self.indexer.push_dict_rows(dict_rows=dict_imgs, table_name=self.indexer.table_sha1infos_name)

        # Mark batch as processed
        update_processed_dict = {update_id: {info_column_family + ':' + update_str_processed: datetime.now().strftime('%Y-%m-%d:%H.%M.%S')}}
        self.indexer.push_dict_rows(dict_rows=update_processed_dict, table_name=self.indexer.table_updateinfos_name)

        # Mark as completed if all rows had an extraction
        if len(rows_batch) == len(dict_imgs.keys()):
          update_completed_dict = {update_id: {info_column_family + ':' + update_str_completed: str(1)}}
          self.indexer.push_dict_rows(dict_rows=update_completed_dict,
                                      table_name=self.indexer.table_updateinfos_name)

        # Cleanup
        del self.q_in
        del self.q_out

        # To try to adjust an overly optimistic nb_threads setting
        # if (sum(thread_creation_failed) > 0 or sum(deleted_extr) > 0) and self.nb_threads > 2:
        #   self.nb_threads -= 1

        print_msg = "[{}] Completed update {} in {}s."
        print(print_msg.format(self.pp, update_id, time.time() - start_update))
        sys.stdout.flush()
        self.nb_err = 0

        # Force garbage collection?
        gc.collect()

        # Should we just raise an Exception and restart clean?
        if sum(thread_creation_failed) > 0 or sum(deleted_extr) > 0:
           raise ValueError("Something went wrong. Trying to restart clean")

      except Exception as inst:
        exc_type, exc_obj, exc_tb = sys.exc_info()
        fulltb = traceback.format_tb(exc_tb)
        raise type(inst)(" {} ({})".format(inst, ''.join(fulltb)))

  def run(self):
    self.nb_empt = 0
    self.nb_err = 0
    while True:
      self.process_batch()
      print("[ExtractionProcessor: log] Nothing to process at: {}".format(datetime.now().strftime('%Y-%m-%d:%H.%M.%S')))
      sys.stdout.flush()
      time.sleep(10*self.nb_empt)
      self.nb_empt += 1
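
A sketch of a possible driver for this processor, under the same assumption that ConfReader maps prefix + parameter name to configuration keys; the "EXTR_" prefix, the sub-prefixes and every value are illustrative placeholders rather than the project's actual configuration (only "SBPYCAFFEIMGFEAT_" appears in code example #4 below).

# Hypothetical entry point; assumes ExtractionProcessor and its dependencies are importable.
if __name__ == "__main__":
    proc_conf = {
        "EXTR_input_type": "image",                   # placeholder
        "EXTR_nb_threads": 4,                         # number of DaemonBatchExtractor threads
        "EXTR_featurizer_type": "sbpycaffe",          # placeholder featurizer type
        "EXTR_featurizer_prefix": "SBPYCAFFEIMGFEAT_",
        "EXTR_detector_type": "full",                 # extract features from the whole image
        "EXTR_indexer_prefix": "HBI_",
        "EXTR_proc_ingester_prefix": "KIN_",
        "EXTR_ingestion_input": "hbase",              # poll HBase for updates instead of Kafka
        # "HBI_..." and "KIN_..." entries (hosts, tables, topics) would go here.
    }
    ep = ExtractionProcessor(proc_conf, prefix="EXTR_")
    ep.run()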
Code example #4
        "/home/ubuntu/caffe_cpu/build/tools/extract_nfeatures",
        "HBI_host": "10.1.94.57",
        "HBI_table_sha1infos": "escorts_images_sha1_infos_from_ts"
    }

    pyconf = {
        "SBPYCAFFEIMGFEAT_sbcaffe_path":
        "./data/caffe_sentibank_train_iter_250000",
        "SBPYCAFFEIMGFEAT_imgmean_path": "./data/imagenet_mean.npy",
    }

    diffs = []

    rows = []
    if list_sha1s[0]:
        hbi = HBaseIndexerMinimal(conf, prefix="HBI_")
        rows = hbi.get_columns_from_sha1_rows(
            list_sha1s, columns=["info:featnorm_cu", "info:s3_url"])
    sbclif = SentiBankCmdLineImgFeaturizer(conf)
    sbpcif = SentiBankPyCaffeImgFeaturizer(pyconf)

    for row in rows:
        feat_hbase_b64 = featB64decode(row[1]["info:featnorm_cu"])
        #print feat_hbase_b64.shape
        img_url = row[1]["info:s3_url"]
        start_extr = time.time()
        img_buffer = get_buffer_from_URL(img_url)
        feat, data = sbclif.featurize(img_buffer, sha1=row[0])
        img_buffer.seek(0)
        pydata = sbpcif.preprocess_img(img_buffer)
        fpydata = pydata.flatten()
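
Code example #4 stops mid-loop; below is a hedged sketch of how the comparison could be finished, assuming the stored "info:featnorm_cu" feature and the freshly computed one are comparable 1-D numpy arrays and that numpy is imported as np in the original script.

# Hypothetical continuation of the loop body above (sketch only):
# compare the recomputed command-line feature to the stored one.
diffs.append(np.linalg.norm(feat - feat_hbase_b64))
print("{}: cmdline vs. HBase feature diff: {}, extraction took {}s".format(
    row[0], diffs[-1], time.time() - start_extr))

# After the loop, a summary of the accumulated differences could be printed:
if diffs:
    print("max diff: {}, mean diff: {}".format(max(diffs), sum(diffs) / len(diffs)))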
Code example #5
    def __init__(self, global_conf, prefix=DEFAULT_EXTR_PROC_PREFIX):
        """ExtractionProcessor constructor

    :param global_conf: configuration file or dictionary
    :type global_conf: str, dict
    :param prefix: prefix in configuration
    :type prefix: str
    """
        self.extractor = None
        self.nb_empt = 0
        self.nb_err = 0
        self.max_proc_time = 1200  # in seconds. Increased for sbcmdline...
        self.url_input = True

        super(ExtractionProcessor, self).__init__(global_conf, prefix)

        # TODO: move that to self.read_conf()
        # Get required parameters
        self.input_type = self.get_required_param("input_type")
        self.nb_threads = self.get_required_param("nb_threads")
        self.featurizer_type = self.get_required_param("featurizer_type")
        self.featurizer_prefix = self.get_required_param("featurizer_prefix")
        self.detector_type = self.get_required_param("detector_type")

        # Get optional parameters
        self.verbose = int(self.get_param("verbose", default=0))
        self.ingestion_input = self.get_param("ingestion_input",
                                              default="kafka")
        self.push_back = self.get_param("push_back", default=False)
        file_input = self.get_param("file_input")
        print("[{}.ExtractionProcessor: log] file_input: {}".format(
            self.pp, file_input))
        if file_input:
            self.url_input = False
        print("[{}.ExtractionProcessor: log] url_input: {}".format(
            self.pp, self.url_input))

        # Means we extract feature from the whole image
        if self.detector_type == "full":
            self.detector = None

        self.extr_prefix = build_extr_str(self.featurizer_type,
                                          self.detector_type, self.input_type)
        self.set_pp()

        # Initialize queues
        self.init_queues()

        # Initialize indexer
        # We now have two indexers:
        # - one "in_indexer" for TF table with buffer, img URLs etc...
        # - one "out_indexer" for our table with extractions etc
        # NB: they could be the same if tables are merged...
        self.out_indexer = HBaseIndexerMinimal(
            self.global_conf, prefix=self.get_required_param("indexer_prefix"))
        prefix_in_indexer = self.get_param("in_indexer_prefix", default=False)
        if prefix_in_indexer:
            self.in_indexer = HBaseIndexerMinimal(self.global_conf,
                                                  prefix=prefix_in_indexer)
            insha1tab = self.in_indexer.table_sha1infos_name
            insha1cfs = self.in_indexer.get_dictcf_sha1_table()
            print("[{}] 'in_indexer' sha1 table {} columns are: {}".format(
                self.pp, insha1tab, insha1cfs))
        else:
            print(
                "[{}] empty 'in_indexer_prefix', using out_indexer as in_indexer too."
                .format(self.pp))
            self.in_indexer = self.out_indexer

        # Initialize extractors only once (just one first)
        self.extractors = []
        # DONE: use 'out_indexer'
        self.extractors.append(
            GenericExtractor(self.detector_type, self.featurizer_type,
                             self.input_type, self.out_indexer.extrcf,
                             self.featurizer_prefix, self.global_conf))

        # DONE: use 'in_indexer'
        if self.url_input:
            self.img_column = self.in_indexer.get_col_imgurl()
        else:
            self.img_column = self.in_indexer.get_col_imgpath()
        img_cols = [
            self.in_indexer.get_col_imgbuff(),
            self.in_indexer.get_col_imgurlbak(), self.img_column
        ]
        print("[{}.ExtractionProcessor: log] img_cols: {}".format(
            self.pp, img_cols))

        self.last_update_date_id = ''

        # Initialize ingester
        self.ingester = GenericKafkaProcessor(
            self.global_conf,
            prefix=self.get_required_param("proc_ingester_prefix"))
        self.ingester.pp = "ep"
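
The out_indexer / in_indexer split above lets reads (image buffers and URLs, e.g. from the TF table) and writes (extractions and update status) target different HBase tables. A minimal configuration sketch of that setup follows; the "EXTR_", "HBIIN_" and "HBIOUT_" prefixes, hosts and table names are hypothetical, modeled on the "HBI_host" / "HBI_table_sha1infos" keys of code example #4.

# Hypothetical fragment illustrating the two-indexer configuration.
proc_conf_fragment = {
    "EXTR_indexer_prefix": "HBIOUT_",     # "out_indexer": extractions + update infos
    "EXTR_in_indexer_prefix": "HBIIN_",   # "in_indexer": image buffers / URLs; omit to reuse out_indexer
    "HBIOUT_host": "hbase-out.example.org",
    "HBIOUT_table_sha1infos": "my_extractions_sha1_infos",
    "HBIIN_host": "hbase-in.example.org",
    "HBIIN_table_sha1infos": "tf_images_sha1_infos",
}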
Code example #6
class ExtractionProcessor(ConfReader):
    """ExtractionProcessor class
  """
    def __init__(self, global_conf, prefix=DEFAULT_EXTR_PROC_PREFIX):
        """ExtractionProcessor constructor

    :param global_conf: configuration file or dictionary
    :type global_conf: str, dict
    :param prefix: prefix in configuration
    :type prefix: str
    """
        self.extractor = None
        self.nb_empt = 0
        self.nb_err = 0
        self.max_proc_time = 900  # in seconds. Increased for sbcmdline...
        self.url_input = True

        super(ExtractionProcessor, self).__init__(global_conf, prefix)

        # TODO: move that to self.read_conf()
        # Get required parameters
        self.input_type = self.get_required_param("input_type")
        self.nb_threads = self.get_required_param("nb_threads")
        self.featurizer_type = self.get_required_param("featurizer_type")
        self.featurizer_prefix = self.get_required_param("featurizer_prefix")
        self.detector_type = self.get_required_param("detector_type")

        # Get optional parameters
        self.verbose = int(self.get_param("verbose", default=0))
        self.maxucme = int(
            self.get_param("max_up_check_miss_extr",
                           default=MAX_UP_CHECK_MISS_EXTR))
        self.ingestion_input = self.get_param("ingestion_input",
                                              default="kafka")
        self.push_back = self.get_param("push_back", default=False)
        file_input = self.get_param("file_input")
        print("[{}.ExtractionProcessor: log] file_input: {}".format(
            self.pp, file_input))
        if file_input:
            self.url_input = False
        print("[{}.ExtractionProcessor: log] url_input: {}".format(
            self.pp, self.url_input))

        # Means we extract feature from the whole image
        if self.detector_type == "full":
            self.detector = None

        self.extr_prefix = build_extr_str(self.featurizer_type,
                                          self.detector_type, self.input_type)
        self.set_pp()

        # Initialize queues
        self.init_queues()

        # Initialize indexer
        # We now have two indexers:
        # - one "in_indexer" for TF table with buffer, img URLs etc...
        # - one "out_indexer" for our table with extractions etc
        # NB: they could be the same if tables are merged...
        self.out_indexer = HBaseIndexerMinimal(
            self.global_conf, prefix=self.get_required_param("indexer_prefix"))
        self.out_indexer.pp = "ProcOutHBase"
        prefix_in_indexer = self.get_param("in_indexer_prefix", default=False)
        if prefix_in_indexer:
            self.in_indexer = HBaseIndexerMinimal(self.global_conf,
                                                  prefix=prefix_in_indexer)
            self.in_indexer.pp = "ProcInHBase"
            insha1tab = self.in_indexer.table_sha1infos_name
            insha1cfs = self.in_indexer.get_dictcf_sha1_table()
            print("[{}] 'in_indexer' sha1 table {} columns are: {}".format(
                self.pp, insha1tab, insha1cfs))
        else:
            print(
                "[{}] empty 'in_indexer_prefix', using out_indexer as in_indexer too."
                .format(self.pp))
            self.in_indexer = self.out_indexer
            self.in_indexer.pp = "ProcInOutHBase"

        # Initialize extractors only once (just one first)
        self.extractors = []
        # DONE: use 'out_indexer'
        self.extractors.append(
            GenericExtractor(self.detector_type, self.featurizer_type,
                             self.input_type, self.out_indexer.extrcf,
                             self.featurizer_prefix, self.global_conf))

        # DONE: use 'in_indexer'
        if self.url_input:
            self.img_column = self.in_indexer.get_col_imgurl()
        else:
            self.img_column = self.in_indexer.get_col_imgpath()
        img_cols = [
            self.in_indexer.get_col_imgbuff(),
            self.in_indexer.get_col_imgurlbak(), self.img_column
        ]
        print("[{}.ExtractionProcessor: log] img_cols: {}".format(
            self.pp, img_cols))

        self.last_update_date_id = "1970-01-01"
        self.last_missing_extr_date = "1970-01-01"

        # Initialize ingester
        self.ingester = GenericKafkaProcessor(
            self.global_conf,
            prefix=self.get_required_param("proc_ingester_prefix"))
        self.ingester.pp = "ep"

    def set_pp(self, pp="ExtractionProcessor"):
        """Set pretty name

    :param pp: pretty name prefix
    :type pp: str
    """
        self.pp = pp
        if self.extractor:
            self.pp += "_" + self.extr_prefix

    def init_queues(self):
        """Initialize queues list ``self.q_in`` and ``self.q_out``
    """
        from multiprocessing import JoinableQueue
        self.q_in = []
        self.q_out = []
        for _ in range(self.nb_threads):
            self.q_in.append(JoinableQueue(0))
            self.q_out.append(JoinableQueue(0))

    # Should these two methods be in indexer?
    def is_update_unprocessed(self, update_id):
        """Check if an update was not processed yet

    :param update_id: update id
    :type update_id: str
    :return: boolean indicating if update ``update_id`` is unprocessed
    :rtype: bool
    """
        # DONE: use out_indexer
        update_rows = self.out_indexer.get_rows_by_batch(
            [update_id], table_name=self.out_indexer.table_updateinfos_name)
        if update_rows:
            for row in update_rows:
                if self.out_indexer.get_col_upproc() in row[1]:
                    return False
        return True

    def is_update_notstarted(self, update_id, max_delay=None):
        """Check if an update was not started yet

    :param update_id: update id
    :type update_id: str
    :param max_delay: delay (in seconds) between marked start time and now to consider update failed
    :type max_delay: int
    :return: boolean
    :rtype: bool
    """
        # DONE: use out_indexer
        update_rows = self.out_indexer.get_rows_by_batch(
            [update_id], table_name=self.out_indexer.table_updateinfos_name)
        if update_rows:
            for row in update_rows:
                # changed to: self.column_update_started
                #if info_column_family+":"+update_str_started in row[1]:
                #if self.column_update_started in row[1]:
                # DONE: use out_indexer
                if self.out_indexer.get_col_upstart() in row[1]:
                    if max_delay:
                        start_str = row[1][self.out_indexer.get_col_upstart()]
                        # start time format is '%Y-%m-%d:%H.%M.%S'
                        start_dt = datetime.strptime(start_str,
                                                     '%Y-%m-%d:%H.%M.%S')
                        now_dt = datetime.now()
                        diff_dt = now_dt - start_dt
                        if diff_dt.total_seconds() > max_delay:
                            return True
                    return False
        return True

    def get_batch_hbase(self):
        """Get one batch of images from HBase

    :yield: tuple (rows_batch, update_id)
    """
        # legacy implementation: better to have a kafka topic for batches to be processed to allow
        # safe and efficient parallelization on different machines
        # DONE: use in_indexer
        img_cols = [
            self.in_indexer.get_col_imgbuff(),
            self.in_indexer.get_col_imgurlbak(), self.img_column
        ]
        try:
            # DONE: use out_indexer
            for updates in self.out_indexer.get_unprocessed_updates_from_date(
                    self.last_update_date_id, extr_type=self.extr_prefix):
                for update_id, update_cols in updates:
                    if self.extr_prefix in update_id:
                        # double check update has not been processed somewhere else
                        if self.is_update_unprocessed(update_id):
                            # double check update was not marked as started recently i.e. by another process
                            if self.is_update_notstarted(
                                    update_id, max_delay=TIME_ELAPSED_FAILED):
                                # DONE: use out_indexer
                                list_sha1s = update_cols[
                                    self.out_indexer.get_col_listsha1s(
                                    )].split(',')
                                msg = "[{}.get_batch_hbase: log] Update {} has {} images."
                                print(
                                    msg.format(self.pp, update_id,
                                               len(list_sha1s)))
                                # We should time that; it seems slow, i.e. 2-3 minutes per update.
                                try:
                                    rows_batch = self.in_indexer.get_columns_from_sha1_rows(
                                        list_sha1s,
                                        rbs=BATCH_SIZE_IMGBUFFER,
                                        columns=img_cols)
                                except Exception:
                                    msg = "[{}.get_batch_hbase: warning] Failed retrieving images data for update: {}"
                                    print(msg.format(self.pp, update_id))
                                    # flush?
                                    sys.stdout.flush()
                                    # Update self.last_update_date_id ?
                                    #self.last_update_date_id = '_'.join(update_id.split('_')[-2:])
                                    continue
                                # print "rows_batch", rows_batch
                                if rows_batch:
                                    yield rows_batch, update_id
                                else:
                                    msg = "[{}.get_batch_hbase: log] Did not get any image buffer for update: {}"
                                    print(msg.format(self.pp, update_id))
                                    #msg = "[{}.get_batch_hbase: log] Was trying to read columns {} from table {} for rows {}"
                                    #print(msg.format(self.pp, img_cols, self.in_indexer.table_sha1infos_name, list_sha1s))
                                # Store last update id
                                self.last_update_date_id = '_'.join(
                                    update_id.split('_')[-2:])
                            else:
                                msg = "[{}.get_batch_hbase: log] Skipping update started recently: {}"
                                print(msg.format(self.pp, update_id))
                                continue
                        else:
                            msg = "[{}.get_batch_hbase: log] Skipping already processed update: {}"
                            print(msg.format(self.pp, update_id))
                            continue
                    else:
                        if self.verbose > 6:
                            msg = "[{}.get_batch_hbase: log] Skipping update {} from another extraction type."
                            print(msg.format(self.pp, update_id))
            else:
                print("[{}.get_batch_hbase: log] No unprocessed update found.".
                      format(self.pp))
                # Should we reinitialize self.last_update_date_id?
                # Look for updates that have some unprocessed images
                # TODO: whether we do that or not could be specified by a parameter
                # as this induces a slowdown during updates...
                # DONE: use out_indexer
                count_ucme = 0
                stop_cme = False
                for updates in self.out_indexer.get_missing_extr_updates_from_date(
                        self.last_missing_extr_date,
                        extr_type=self.extr_prefix):
                    for update_id, update_cols in updates:
                        if self.extr_prefix in update_id:
                            # DONE: use out_indexer
                            if self.out_indexer.get_col_listsha1s(
                            ) in update_cols:
                                list_sha1s = update_cols[
                                    self.out_indexer.get_col_listsha1s(
                                    )].split(',')
                                msg = "[{}.get_batch_hbase: log] Update {} has {} images missing extractions."
                                print(
                                    msg.format(self.pp, update_id,
                                               len(list_sha1s)))
                                sys.stdout.flush()
                                # also get 'ext:' to check if extraction was already processed?
                                # DONE: use in_indexer
                                rows_batch = self.in_indexer.get_columns_from_sha1_rows(
                                    list_sha1s,
                                    rbs=BATCH_SIZE_IMGBUFFER,
                                    columns=img_cols)
                                if rows_batch:
                                    yield rows_batch, update_id
                                    self.last_missing_extr_date = '_'.join(
                                        update_id.split('_')[-2:])
                                    count_ucme += 1
                                    if count_ucme >= self.maxucme:
                                        stop_cme = True
                                        break
                                else:
                                    msg = "[{}.get_batch_hbase: log] Did not get any image buffer for update: {}"
                                    print(msg.format(self.pp, update_id))
                            else:
                                msg = "[{}.get_batch_hbase: log] Update {} has no images list."
                                print(msg.format(self.pp, update_id))
                        else:
                            msg = "[{}.get_batch_hbase: log] Skipping update {} from another extraction type."
                            print(msg.format(self.pp, update_id))
                    # We have reached the maximum number of checks for missing extractions in one call
                    if stop_cme:
                        break
                else:
                    if stop_cme:
                        msg = "[{}.get_batch_hbase: log] Stopped checking updates with missing extractions"
                        msg += " after finding {}/{}."
                        print(msg.format(self.pp, count_ucme, self.maxucme))
                        msg = "[{}.get_batch_hbase: log] Will restart next time from: {}"
                        print(msg.format(self.pp, self.last_missing_extr_date))
                        sys.stdout.flush()
                    else:
                        msg = "[{}.get_batch_hbase: log] No updates with missing extractions found."
                        print(msg.format(self.pp))
                        sys.stdout.flush()
                        # Re-initialize dates just to make sure we don't miss anything
                        self.last_update_date_id = "1970-01-01"
                        self.last_missing_extr_date = "1970-01-01"

        except Exception as inst:
            # If we reach this point it is really a succession of failures
            full_trace_error("[{}.get_batch_hbase: error] {}".format(
                self.pp, inst))
            # Raise Exception to restart process or docker
            raise inst

    def get_batch_kafka(self):
        """Get one batch of images from Kafka

    :yield: tuple (rows_batch, update_id)
    """
        # Read from a kafka topic to allow safer parallelization on different machines
        # DONE: use in_indexer
        img_cols = [
            self.in_indexer.get_col_imgbuff(),
            self.in_indexer.get_col_imgurlbak(), self.img_column
        ]
        try:
            # Needs to read topic to get update_id and list of sha1s
            if self.ingester.consumer:
                for msg in self.ingester.consumer:
                    msg_dict = json.loads(msg.value)
                    update_id = msg_dict.keys()[0]
                    # NB: Try to get update info and check it was really not processed yet.
                    if self.is_update_unprocessed(update_id):
                        str_list_sha1s = msg_dict[update_id]
                        list_sha1s = str_list_sha1s.split(',')
                        msg = "[{}.get_batch_kafka: log] Update {} has {} images."
                        print(msg.format(self.pp, update_id, len(list_sha1s)))
                        if self.verbose > 3:
                            msg = "[{}.get_batch_kafka: log] Looking for columns: {}"
                            print(msg.format(self.pp, img_cols))
                        # DONE: use in_indexer
                        #rows_batch = self.in_indexer.get_columns_from_sha1_rows(list_sha1s, columns=img_cols)
                        rows_batch = self.in_indexer.get_columns_from_sha1_rows(
                            list_sha1s,
                            rbs=BATCH_SIZE_IMGBUFFER,
                            columns=img_cols)
                        #print "rows_batch", rows_batch
                        if rows_batch:
                            if self.verbose > 4:
                                msg = "[{}.get_batch_kafka: log] Yielding for update: {}"
                                print(msg.format(self.pp, update_id))
                            yield rows_batch, update_id
                            self.ingester.consumer.commit()
                            if self.verbose > 4:
                                msg = "[{}.get_batch_kafka: log] After yielding for update: {}"
                                print(msg.format(self.pp, update_id))
                            self.last_update_date_id = '_'.join(
                                update_id.split('_')[-2:])
                        # Should we try to commit offset only at this point?
                        else:
                            msg = "[{}.get_batch_kafka: log] Did not get any image buffers for the update: {}"
                            print(msg.format(self.pp, update_id))
                    else:
                        msg = "[{}.get_batch_kafka: log] Skipping already processed update: {}"
                        print(msg.format(self.pp, update_id))
                else:
                    print("[{}.get_batch_kafka: log] No update found.".format(
                        self.pp))
                    # Fall back to checking HBase for unstarted/unfinished updates
                    for rows_batch, update_id in self.get_batch_hbase():
                        yield rows_batch, update_id
            else:
                print("[{}.get_batch_kafka: log] No consumer found.".format(
                    self.pp))
                # Fall back to checking HBase for unstarted/unfinished updates
                for rows_batch, update_id in self.get_batch_hbase():
                    yield rows_batch, update_id
        except Exception as inst:
            # If we reach this point it is really a succession of failures
            full_trace_error("[{}.get_batch_kafka: error] {}".format(
                self.pp, inst))
            # Raise Exception to restart process or docker
            raise inst

    def get_batch(self):
        """Get one batch of images

    :yield: tuple (rows_batch, update_id)
    """
        if self.ingestion_input == "hbase":
            for rows_batch, update_id in self.get_batch_hbase():
                yield rows_batch, update_id
        else:
            for rows_batch, update_id in self.get_batch_kafka():
                yield rows_batch, update_id

    def process_batch(self):
        """Process one batch of images

    :raises Exception: if something goes really wrong
    """
        # Get a new update batch
        try:
            for rows_batch, update_id in self.get_batch():
                start_update = time.time()
                img_list_size = len(rows_batch)
                print("[{}] Processing update {} of {} rows.".format(
                    self.pp, update_id, img_list_size))
                sys.stdout.flush()

                # Initialize
                self.nb_empt = 0
                self.init_queues()
                threads = []

                # If we have deleted an extractor at some point or for first batch
                nb_extr_to_create = self.nb_threads - len(self.extractors)
                if nb_extr_to_create:
                    start_create_extractor = time.time()
                    while len(self.extractors) < min(self.nb_threads,
                                                     img_list_size):
                        # DONE: use 'out_indexer'
                        self.extractors.append(
                            GenericExtractor(self.detector_type,
                                             self.featurizer_type,
                                             self.input_type,
                                             self.out_indexer.extrcf,
                                             self.featurizer_prefix,
                                             self.global_conf))
                    msg = "[{}] Created {} extractors in {}s."
                    create_extr_time = time.time() - start_create_extractor
                    print(
                        msg.format(self.pp, len(self.extractors),
                                   create_extr_time))

                # Mark batch as started being processed
                now_str = datetime.now().strftime('%Y-%m-%d:%H.%M.%S')
                # changed to: self.column_update_started
                #dict_val = {info_column_family + ':' + update_str_started: now_str}
                #dict_val = {self.column_update_started: now_str}
                # DONE: use out_indexer
                dict_val = {self.out_indexer.get_col_upstart(): now_str}
                update_started_dict = {update_id: dict_val}
                # DONE: use out_indexer
                self.out_indexer.push_dict_rows(
                    dict_rows=update_started_dict,
                    table_name=self.out_indexer.table_updateinfos_name)

                # TODO: define a get_buffer_images method
                # --------
                # Push images to queue
                list_in = []
                # For parallelized downloading...
                from Queue import Queue
                nb_imgs_dl = 0
                q_in_dl = Queue(0)
                q_out_dl = Queue(0)

                start_get_buffer = time.time()
                # DONE: use in_indexer in all this scope
                # How could we transfer URL from in table to out table if they are different?...
                for img in rows_batch:
                    # should decode base64
                    #if img_buffer_column in img[1]:
                    if self.in_indexer.get_col_imgbuff() in img[1]:
                        # That's messy...
                        # b64buffer = buffer_to_B64(cStringIO.StringIO(img[1][self.in_indexer.get_col_imgbuff()]))
                        # use img[1].pop(self.in_indexer.get_col_imgbuff())
                        b64buffer = buffer_to_B64(
                            cStringIO.StringIO(img[1].pop(
                                self.in_indexer.get_col_imgbuff())))
                        tup = (img[0], b64buffer, False)
                        list_in.append(tup)
                    else:
                        # need to re-download, accumulate a list of URLs to download
                        if self.verbose > 5:
                            msg = "[{}: log] Will try to download image {} without buffer"
                            print(msg.format(self.pp, img[0]))
                        # Deal with img_path_column for local_images_kafka_pusher
                        if self.img_column in img[1]:
                            q_in_dl.put((img[0], img[1][self.img_column],
                                         self.push_back))
                            nb_imgs_dl += 1
                        elif self.in_indexer.get_col_imgurlbak() in img[1]:
                            q_in_dl.put(
                                (img[0],
                                 img[1][self.in_indexer.get_col_imgurlbak()],
                                 self.push_back))
                            nb_imgs_dl += 1
                        else:
                            msg = "[{}: warning] No buffer and no URL/path for image {} !"
                            print(msg.format(self.pp, img[0]))
                            continue

                # At this point we can delete rows_batch
                del rows_batch
                gc.collect()

                # Download missing images
                nb_dl = 0
                nb_dl_failed = 0
                if nb_imgs_dl > 0:
                    threads_dl = []
                    for i in range(min(self.nb_threads, nb_imgs_dl)):
                        # should read (url, obj_pos) from self.q_in
                        # and push (url, obj_pos, buffer, img_info, start_process, end_process) to self.q_out
                        thread = ThreadedDownloaderBufferOnly(
                            q_in_dl, q_out_dl, url_input=self.url_input)
                        thread.start()
                        threads_dl.append(thread)

                    q_in_dl.join()

                    # Push downloaded images to list_in too
                    while nb_dl < nb_imgs_dl:
                        # This can block?
                        #sha1, buffer, push_back, inst = q_out_dl.get()
                        try:
                            sha1, buffer, push_back, inst = q_out_dl.get(
                                True, 10)
                        except Exception as queue_err:
                            msg = "[{}: error] Download queue out timed out: {}"
                            print(msg.format(self.pp, queue_err))
                            break
                        nb_dl += 1
                        if inst:
                            if self.verbose > 6:
                                msg = "[{}: log] Could not download image {}, error was: {}"
                                print(msg.format(self.pp, sha1, inst))
                            nb_dl_failed += 1
                        else:
                            if buffer:
                                list_in.append((sha1, buffer, push_back))
                            else:
                                # Is that even possible?
                                msg = "[{}: error] No error but no buffer either for image {}"
                                print(msg.format(self.pp, sha1))

                get_buffer_time = time.time() - start_get_buffer
                buff_list_size = len(list_in)
                msg = "[{}] Got {}/{} image buffers ({}/{} downloaded) for update {} in {}s."
                print(
                    msg.format(self.pp, buff_list_size, img_list_size,
                               nb_dl - nb_dl_failed, nb_dl, update_id,
                               get_buffer_time))
                sys.stdout.flush()

                # --------
                # if buff_list_size == 0, we shouldn't try to process anything, just mark update as processed
                if buff_list_size != 0:

                    # TODO: define a get_features method
                    # --------
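                    # Ceiling division: split the buffered images into at most nb_threads
                    # batches, one per input queue, so no image is dropped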
                    q_batch_size = int(
                        math.ceil(float(buff_list_size) / self.nb_threads))
                    for i, q_batch in enumerate(
                            build_batch(list_in, q_batch_size)):
                        self.q_in[i].put(q_batch)

                    # At this point we can delete list_in
                    del list_in
                    gc.collect()

                    q_in_size = []
                    q_in_size_tot = 0
                    for i in range(self.nb_threads):
                        q_in_size.append(self.q_in[i].qsize())
                        q_in_size_tot += q_in_size[i]
                    if self.verbose > 5:
                        print("[{}] Total input queues sizes is: {}".format(
                            self.pp, q_in_size_tot))

                    # Start daemons...
                    thread_creation_failed = [0] * self.nb_threads
                    for i in range(self.nb_threads):
                        # one per non empty input queue
                        if q_in_size[i] > 0:
                            try:
                                thread = DaemonBatchExtractor(
                                    self.extractors[i],
                                    self.q_in[i],
                                    self.q_out[i],
                                    verbose=self.verbose)
                                # Could get a 'Cannot allocate memory' if we are using too many threads...
                                thread.start()
                                threads.append(thread)
                            except OSError as inst:
                                # Should we try to push self.q_in[i] data to some other thread?
                                msg = "[{}.process_batch: error] Could not start thread #{}: {}"
                                print(msg.format(self.pp, i + 1, inst))
                                thread_creation_failed[i] = 1
                                time.sleep(sum(thread_creation_failed))

                    if sum(thread_creation_failed) == self.nb_threads:
                        # We are in trouble...
                        raise RuntimeError("Could not start any thread...")

                    nb_threads_running = len(threads)
                    deleted_extr = [0] * nb_threads_running
                    start_process = time.time()
                    stop = time.time() + self.max_proc_time
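                    # Absolute deadline: threads still busy past this time are marked killed below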
                    # Wait for all tasks to be marked as done
                    threads_finished = [0] * nb_threads_running
                    thread_msg = "[{}] Thread {}/{} (pid: {}) "
                    while sum(threads_finished) < nb_threads_running:
                        for i in range(nb_threads_running):
                            if sum(threads_finished) == nb_threads_running:
                                sys.stdout.flush()
                                break
                            if threads_finished[i] == 1:
                                continue
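                            # Map the running-thread index back to its input queue index,
                            # accounting for queues whose thread failed to start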
                            i_q_in = i + sum(thread_creation_failed[:i + 1])
                            if q_in_size[i_q_in] > 0:
                                # This seems to block forever sometimes, if subprocess crashed?...
                                #self.q_in[i].join()
                                # Manual join with timeout...
                                # https://github.com/python/cpython/blob/3.6/Lib/multiprocessing/queues.py
                                if not self.q_in[
                                        i_q_in]._unfinished_tasks._semlock._is_zero(
                                        ) and time.time() < stop:
                                    time.sleep(1)
                                else:
                                    if self.q_in[
                                            i_q_in]._unfinished_tasks._semlock._is_zero(
                                            ):
                                        if self.verbose > 5:
                                            msg = thread_msg + "marked as finished because processing seems finished"
                                            print(
                                                msg.format(
                                                    self.pp, i + 1,
                                                    nb_threads_running,
                                                    threads[i].pid))
                                    else:
                                        # Try to stop processing
                                        threads[i].killed = True
                                        if self.verbose > 0:
                                            # In which cases does this happen?...
                                            msg = thread_msg + "killed as max_proc_time ({}) has passed."
                                            print(
                                                msg.format(
                                                    self.pp, i + 1,
                                                    nb_threads_running,
                                                    threads[i].pid,
                                                    self.max_proc_time))
                                            sys.stdout.flush()
                                            # Try to delete corresponding extractor to free memory?
                                            # And reduce number of threads at the end of the loop
                                        try:
                                            # This can block?
                                            #self.q_in[i_q_in].task_done()
                                            if deleted_extr[i] == 0:
                                                # we pushed the extractor as self.extractors[i] in a loop of self.nb_threads
                                                # we use i_q_in
                                                #del self.extractors[i_q_in]
                                                deleted_extr[i] = 1
                                                del self.extractors[
                                                    i_q_in -
                                                    sum(deleted_extr[:i + 1])]
                                                #del threads[i - sum(deleted_extr[:i+1])]
                                        except Exception:
                                            pass
                                    threads_finished[i] = 1
                            else:
                                if self.verbose > 2:
                                    # We actually never gave something to process...
                                    msg = thread_msg + "marked as finished because no data was passed to it"
                                    print(
                                        msg.format(self.pp, i + 1,
                                                   nb_threads_running,
                                                   threads[i].pid))
                                threads_finished[i] = 1

                    # Cleanup threads to free memory before getting data back
                    # Daemon may still be running...
                    # and will actually be deleted only when they exit after not getting a batch
                    del threads

                    # Gather results
                    q_out_size = []
                    q_out_size_tot = 0
                    for i in range(self.nb_threads):
                        q_out_size.append(self.q_out[i].qsize())
                        q_out_size_tot += q_out_size[i]

                    if self.verbose > 5:
                        print(
                            "[{}: log] Total output queues size is: {}".format(
                                self.pp, q_out_size_tot))
                        sys.stdout.flush()

                    # Can get stuck here?
                    dict_imgs = dict()
                    if q_out_size_tot > 0:
                        for i in range(self.nb_threads):
                            if self.verbose > 4:
                                print("[{}] Thread {} q_out_size: {}".format(
                                    self.pp, i + 1, q_out_size[i]))
                                sys.stdout.flush()
                            while q_out_size[i] > 0 and not self.q_out[
                                    i].empty():
                                if self.verbose > 6:
                                    print("[{}] Thread {} q_out is not empty.".
                                          format(self.pp, i + 1))
                                    sys.stdout.flush()
                                try:
                                    # This can still block forever?
                                    #batch_out = self.q_out[i].get(True, 10)
                                    batch_out = self.q_out[i].get_nowait()
                                    if self.verbose > 4:
                                        msg = "[{}] Got batch of {} features from thread {} q_out."
                                        print(
                                            msg.format(self.pp, len(batch_out),
                                                       i + 1))
                                        sys.stdout.flush()
                                    for sha1, dict_out in batch_out:
                                        dict_imgs[sha1] = dict_out
                                except Exception as get_err:
                                    if self.verbose > 1:
                                        print(
                                            "[{}] Thread {} failed to get from q_out: {}"
                                            .format(self.pp, i + 1, get_err))
                                        sys.stdout.flush()
                                    #pass
                                if self.verbose > 4:
                                    print(
                                        "[{}] Marking task done in q_out of thread {}."
                                        .format(self.pp, i + 1))
                                    sys.stdout.flush()
                                self.q_out[i].task_done()

                    #if self.verbose > 0:
                    print_msg = "[{}] Got features for {}/{} images in {}s."
                    proc_time = time.time() - start_process
                    print(
                        print_msg.format(self.pp, len(dict_imgs.keys()),
                                         buff_list_size, proc_time))
                    sys.stdout.flush()
                    # --------

                    if len(dict_imgs.keys()) > 0:
                        # Push computed features
                        self.out_indexer.push_dict_rows(
                            dict_rows=dict_imgs,
                            table_name=self.out_indexer.table_sha1infos_name)

                else:
                    msg = "[{}: Warning] Could not get any image buffer (out of {} requested) for update {}"
                    print(msg.format(self.pp, img_list_size, update_id))
                    dict_imgs = dict()
                    nb_threads_running = len(threads)
                    thread_creation_failed = [0] * self.nb_threads
                    deleted_extr = [0] * nb_threads_running

                # Mark batch as processed
                now_str = datetime.now().strftime('%Y-%m-%d:%H.%M.%S')
                # DONE: use out_indexer
                update_processed_dict = {
                    update_id: {
                        self.out_indexer.get_col_upproc(): now_str
                    }
                }
                self.out_indexer.push_dict_rows(
                    dict_rows=update_processed_dict,
                    table_name=self.out_indexer.table_updateinfos_name)

                # Mark as completed if all rows had an extraction
                if img_list_size == len(dict_imgs.keys()):
                    # DONE: use out_indexer
                    update_completed_dict = {
                        update_id: {
                            self.out_indexer.get_col_upcomp(): str(1)
                        }
                    }
                    self.out_indexer.push_dict_rows(
                        dict_rows=update_completed_dict,
                        table_name=self.out_indexer.table_updateinfos_name)

                # Cleanup
                del self.q_in
                del self.q_out

                # To try to adjust a too optimistic nb_threads setting
                # if (sum(thread_creation_failed) > 0 or sum(deleted_extr) > 0) and self.nb_threads > 2:
                #   self.nb_threads -= 1

                msg = "[{}] Completed update {} in {}s."
                print(
                    msg.format(self.pp, update_id,
                               time.time() - start_update))
                sys.stdout.flush()
                self.nb_err = 0

                # Force garbage collection
                gc.collect()

                # Should we just raise an Exception and restart clean?
                if sum(thread_creation_failed) > 0 or sum(deleted_extr) > 0:
                    # To try to adjust a too optimistic nb_threads setting
                    if self.nb_threads > 1:
                        self.nb_threads -= 1
                        self.extractors = []
                        gc.collect()
                    else:
                        raise RuntimeError(
                            "Processed failed with a single thread...")

        except Exception as inst:
            #exc_type, exc_obj, exc_tb = sys.exc_info()
            #fulltb = traceback.format_tb(exc_tb)
            print("[{}.process_batch: ERROR] {}".format(self.pp, inst))
            #print("[{}] {} ({})".format(self.pp, inst, ''.join(fulltb)))
            # Things are likely to be very bad at that point... Docker should be restarted
            #if self.nb_threads == 2:
            raise inst
            #raise type(inst)(" {} ({})".format(inst, ''.join(fulltb)))

    def run(self):
        """Run processor
    """
        self.nb_empt = 0
        self.nb_err = 0
        while True:
            self.process_batch()
            msg = "[ExtractionProcessor: log] Nothing to process at: {}"
            print(msg.format(datetime.now().strftime('%Y-%m-%d:%H.%M.%S')))
            sys.stdout.flush()
            time.sleep(10 * min(self.nb_empt, 60))
            self.nb_empt += 1
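The idle back-off in `run` above sleeps `10 * min(self.nb_empt, 60)` seconds between empty polls, so the wait grows by 10 seconds per consecutive empty iteration and caps at 600 seconds. A minimal standalone sketch of that schedule (not part of the class):

# Sketch of the idle back-off used by ExtractionProcessor.run above:
# sleep 10 * min(nb_empt, 60) seconds after each empty poll.
def idle_backoff_schedule(nb_polls):
  """Return the successive sleep durations (in seconds) for nb_polls empty polls."""
  return [10 * min(nb_empt, 60) for nb_empt in range(nb_polls)]

print(idle_backoff_schedule(5))       # [0, 10, 20, 30, 40]
print(idle_backoff_schedule(70)[-1])  # 600, the cap is reached after 60 empty polls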
Code example #8
class ExtractionChecker(ConfReader):
  """ExtractionChecker class
  """

  def __init__(self, global_conf, prefix=DEFAULT_EXTR_CHECK_PREFIX, pid=None):
    """ExtractionChecker constructor

    :param global_conf: configuration file or dictionary
    :type global_conf: str, dict
    :param prefix: prefix in configuration
    :type prefix: str
    :param pid: process id
    :type pid: int
    """
    self.list_extr_prefix = []
    self.pid = pid
    self.dict_sha1_infos = dict()

    super(ExtractionChecker, self).__init__(global_conf, prefix)

    self.last_push = time.time()
    self.nb_imgs_check = 0
    self.nb_imgs_unproc = 0
    self.nb_imgs_unproc_lastprint = 0

    self.featurizer_type = self.get_required_param("featurizer_type")
    self.detector_type = self.get_required_param("detector_type")
    self.input_type = self.get_required_param("input_type")

    # Max delay
    self.max_delay = int(self.get_param("max_delay", default=3600))

    self.list_extr_prefix = [self.featurizer_type, "feat", self.detector_type, self.input_type]
    self.extr_prefix = "_".join(self.list_extr_prefix)
    self.batch_check_column = None
    self.check_columns = []

    # changed to: get column family from indexer in set_check_columns
    # Needs to be built from extraction type and detection input + "_processed"
    #self.extr_family_column = self.get_param("extr_family_column", default="ext")
    # self.extr_prefix_base_column_name = self.extr_family_column + ":" + self.extr_prefix
    # self.extr_check_column = self.extr_prefix_base_column_name + "_processed"
    # # Need to be build from extraction type and extraction input + "_batchid"
    # self.batch_check_column = self.extr_prefix_base_column_name + "_updateid"
    # self.check_columns = [self.extr_check_column, self.batch_check_column]

    self.set_pp()

    # Initialize indexer
    self.indexer = HBaseIndexerMinimal(self.global_conf,
                                       prefix=self.get_required_param("indexer_prefix"))
    self.indexer.pp = "CheckerHBase"
    print(self.get_required_param("indexer_prefix"), self.indexer.get_dictcf_sha1_table())
    self.set_check_columns()
    print(self.check_columns)
    # Initialize ingester
    try:
      self.ingester = GenericKafkaProcessor(self.global_conf,
                                            prefix=self.get_required_param("check_ingester_prefix"))
    except Exception as inst:
      # print "Could not initialize checker, sleeping for {}s.".format(self.max_delay)
      # time.sleep(self.max_delay)
      # raise(inst)
      #print("Could not initialize 'updates_out_topic' ({}). Will write only to HBase.".format(inst))
      print("[{}: ERROR] Could not start ingester.".format(self.pp, inst))
      raise inst
    # This will not be set for HBase processing, but checker would keep dying here...
    self.updates_out_topic = None
    try:
      self.updates_out_topic = self.ingester.get_required_param("producer_updates_out_topic")
    except Exception as inst:
      # print "Could not initialize checker, sleeping for {}s.".format(self.max_delay)
      # time.sleep(self.max_delay)
      # raise(inst)
      #print("Could not initialize 'updates_out_topic' ({}). Will write only to HBase.".format(inst))
      print("{}. Will write only to HBase.".format(inst))

    self.ingester.pp = "ec"
    if self.pid:
      self.ingester.pp += str(self.pid)

  def set_check_columns(self):
    """Set columns to be checked in indexer
    """
    # changed to: get column family from indexer
    extr_prefix_base_column_name = self.indexer.extrcf + ":" + self.extr_prefix
    extr_check_column = extr_prefix_base_column_name + "_processed"
    # Needs to be built from extraction type and extraction input + "_updateid"
    self.batch_check_column = extr_prefix_base_column_name + "_updateid"
    self.check_columns = [extr_check_column, self.batch_check_column]
    #print(self.check_columns)


  def set_pp(self, pp=""):
    """Set pretty name
    """
    self.pp = "ExtractionChecker"
    self.pp += "-".join(self.list_extr_prefix)
    if self.pid:
      self.pp += "." + str(self.pid)

  def store_img_infos(self, msg):
    """Store information about the images of ``msg`` in ``self.dict_sha1_infos``

    :param msg: message (the JSON-parsed value of a Kafka record)
    :type msg: dict
    """
    # msg is the JSON-parsed value of a ConsumerRecord (a collections.namedtuple), see:
    # https://github.com/dpkp/kafka-python/blob/master/kafka/consumer/fetcher.py#L30
    strk = str(msg['sha1'])
    self.dict_sha1_infos[strk] = dict()
    for key in msg:
      # dumps json of 'img_info'
      # We actually need that only for DIG...
      if key == "img_info":
        self.dict_sha1_infos[strk][key] = json.dumps(msg[key])
      else:
        # discard 'img_buffer' (if it exists?...), and 'sha1'
        # if k != "img_buffer" and k != "sha1":
        #  self.dict_sha1_infos[strk][k] = msg[k]
        # discard 'sha1'
        if key != "sha1":
          self.dict_sha1_infos[strk][key] = msg[key]

  def cleanup_dict_infos(self, list_del_sha1s):
    """Remove images ``list_del_sha1s`` from ``self.dict_sha1_infos``

    :param list_del_sha1s: list of images sha1 to remove
    :type list_del_sha1s: list
    """
    for sha1 in list_del_sha1s:
      try:
        del self.dict_sha1_infos[str(sha1)]
      except:
        # could happen when cleaning up duplicates or image processed by another process
        pass

  def get_dict_push(self, list_get_sha1s, daemon=False):
    """Get dictionary to be pushed to HBase for images in ``list_get_sha1s``

    :param list_get_sha1s: list of images
    :type list_get_sha1s: list
    :param daemon: whether the checker is running in daemon mode
    :type daemon: bool
    :return: (dict_push, update_id)
    :rtype: tuple
    """
    #TODO: is this needed for every get_dict_push call?
    self.set_check_columns()
    # TODO: also pass current update_id, and move the creation of update id out of this method
    #  this method should actually be used to 'claim' an image as soon as we can.
    dict_push = dict()
    # append processid to 'update_id' for safe use with multiple consumers, even after restart
    # /!\ beware, it should not contain underscores
    tmp_update_id, _ = self.indexer.get_next_update_id(today=None, extr_type=self.extr_prefix)
    update_id = tmp_update_id + '-' + self.ingester.pp + '-' + str(time.time())
    for sha1 in list_get_sha1s:
      dict_push[str(sha1)] = dict()
      try:
        tmp_dict = self.dict_sha1_infos[str(sha1)]
      except:
        # This would mean the image has been marked as part of another batch by another process,
        # and thus deleted in a previous 'get_unprocessed_rows' call
        # This is also only relevant if we run on Daemon mode...
        # TODO: for transition we won't really have any info to push except the update_id...
        if daemon:
          del dict_push[str(sha1)]
          continue
      # build column names properly i.e. appending 'info:'
      for key in tmp_dict:
        # changed to: use column_family from indexer
        # But the use of 'key' here also means we rely on the input to define column name...
        #dict_push[str(sha1)]['info:' + key] = tmp_dict[key]
        dict_push[str(sha1)][self.indexer.imginfocf + ':' + key] = tmp_dict[key]
      dict_push[str(sha1)][self.batch_check_column] = update_id
    return dict_push, update_id

  def get_unprocessed_rows(self, list_check_sha1s):
    """Get the subset of the list of sha1s ``list_check_sha1s`` that have not been processed yet

    :param list_check_sha1s: list of images sha1 to check
    :type list_check_sha1s: list
    :return: set of unprocessed images
    :rtype: set
    """
    # TODO: also pass current update_id and only delete if != from current update...

    unprocessed_rows = set(list_check_sha1s)

    if list_check_sha1s:
      # Check if the selected sha1 rows in HBase table 'sha1infos' have those check_column
      # This call will only return rows that DO have those check_column
      fam = self.indexer.get_dictcf_sha1_table()
      try:
        sha1s_rows = self.indexer.get_columns_from_sha1_rows(list_check_sha1s, self.check_columns,
                                                             families=fam)
      except Exception as inst:
        print("[{}.get_unprocessed_rows: log] fam: {}".format(self.pp, fam))
        raise inst

          #families=self.tablesha1_col_families)
      if sha1s_rows:
        # TODO: only delete if really previously processed, i.e. if != from current update...
        found_sha1_rows = set([str(row[0]) for row in sha1s_rows])
        # Clean up 'dict_sha1_infos' deleting found_sha1_rows
        self.cleanup_dict_infos(found_sha1_rows)
        set_list_check_sha1s = set(list_check_sha1s)
        # TODO: but we should not re-add them, so we should discard them from unprocessed_rows
        unprocessed_rows = set_list_check_sha1s - found_sha1_rows

    return unprocessed_rows

  def run(self, daemon=False):
    """Run extraction checker

    :param daemon: whether we are running in daemon mode
    :type daemon: bool
    :raises Exception: if check fails
    """
    i = 0
    try:
      list_sha1s_to_process = []
      # TODO: create update_id here

      while True:
        list_check_sha1s = []

        try:
          # Accumulate images infos
          for msg_json in self.ingester.consumer:
            msg = json.loads(msg_json.value)
            # i += 1
            # print((i, len(list_check_sha1s), msg))

            # msg could now contain keys 'sha1' or 'list_sha1s'
            # should we check that we can't have both or other keys?...
            if 'sha1' in msg:
              list_check_sha1s.append(str(msg['sha1']))
              # Store other fields to be able to push them too
              self.store_img_infos(msg)
            elif 'list_sha1s' in msg:
              for sha1 in msg['list_sha1s']:
                list_check_sha1s.append(str(sha1))
                # We won't have any additional infos no?
                # But we should still build a dict for each sample for consistency...
                tmp_dict = dict()
                tmp_dict['sha1'] = str(sha1)
                # will basically push an empty dict to self.dict_sha1_infos, so self.get_dict_push
                # works properly later on...
                self.store_img_infos(tmp_dict)
            else:
              print('Unknown keys in msg: {}'.format(msg.keys()))

            if len(list_check_sha1s) >= self.indexer.batch_update_size:
              break
        except Exception as inst:
          # trying to use 'consumer_timeout_ms' to raise timeout and get last samples
          msg = "[{}: warning] At {}, caught {} {} in consumer loop"
          now_str = datetime.now().strftime('%Y-%m-%d:%H.%M.%S')
          print(msg.format(self.pp, now_str, type(inst), inst))
          sys.stdout.flush()

        if not list_check_sha1s:
          # TODO: should we fallback to scanning Hbase table here?
          continue

        # Check which images have not been processed (or pushed in an update) yet
        unprocessed_rows = self.get_unprocessed_rows(list_check_sha1s)
        self.nb_imgs_check += len(list_check_sha1s)
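        # Only print statistics if the last push is older than max_delay/60 seconds
        # and the unprocessed count changed since the last print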
        push_delay = (time.time() - self.last_push) > self.max_delay / 60
        if push_delay and self.nb_imgs_unproc_lastprint != self.nb_imgs_unproc:
          msg = "[{}: log] Found {}/{} unprocessed images"
          print(msg.format(self.pp, self.nb_imgs_unproc, self.nb_imgs_check))
          self.nb_imgs_unproc_lastprint = self.nb_imgs_unproc

        # TODO: we should mark those images as being 'owned' by the update we are constructing
        # (only important if we are running multiple threads i.e. daemon is True)
        # otherwise another update running at the same time could also claim it (in another ad)
        # could be handle when adding data to the searcher but duplicates in extraction process...

        # Push sha1s to be processed
        for sha1 in unprocessed_rows:
          list_sha1s_to_process.append(sha1)

        # Remove potential duplicates
        list_sha1s_to_process = list(set(list_sha1s_to_process))

        if list_sha1s_to_process:
          # Push them to HBase by batch of 'batch_update_size'
          push_delay = (time.time() - self.last_push) > self.max_delay
          full_batch = len(list_sha1s_to_process) >= self.indexer.batch_update_size
          if full_batch or (push_delay and list_sha1s_to_process):
            # Trim here to push exactly a batch of 'batch_update_size'
            list_push = list_sha1s_to_process[:min(self.indexer.batch_update_size,
                                                   len(list_sha1s_to_process))]

            # TODO: this should be done before,
            # to 'claim' the images as soon as we plan to process them for this update
            # Gather corresponding sha1 infos
            dict_push, update_id = self.get_dict_push(list_push, daemon=daemon)
            if dict_push:
              self.nb_imgs_unproc += len(dict_push.keys())
              msg = "[{}: at {}] Pushing update {} of {} images."
              now_str = datetime.now().strftime('%Y-%m-%d:%H.%M.%S')
              print(msg.format(self.pp, now_str, update_id, len(dict_push.keys())))
              sys.stdout.flush()

              # Push images
              fam = self.indexer.get_dictcf_sha1_table()
              if self.verbose > 4:
                msg = "[{}] Pushing images for update {} with fam {}"
                print(msg.format(self.pp, update_id, fam))
              sha1s_table = self.indexer.table_sha1infos_name
              self.indexer.push_dict_rows(dict_push, sha1s_table, families=fam)

              # Build HBase updates dict
              dict_updates_db = dict()
              now_str = datetime.now().strftime('%Y-%m-%d:%H.%M.%S')
              list_sha1s_col = self.indexer.get_col_listsha1s()
              dict_updates_db[update_id] = {list_sha1s_col: ','.join(dict_push.keys()),
                                            self.indexer.get_col_upcreate(): now_str}
              # Push it
              fam = self.indexer.get_dictcf_update_table()
              if self.verbose > 4:
                msg = "[{}] Pushing update {} info with fam {}"
                print(msg.format(self.pp, update_id, fam))
              self.indexer.push_dict_rows(dict_updates_db, self.indexer.table_updateinfos_name,
                                          families=fam)

              # Build HBase updates dict
              if self.updates_out_topic is not None:
                dict_updates_kafka = dict()
                dict_updates_kafka[update_id] = ','.join(dict_push.keys())
                # Push it
                self.ingester.producer.send(self.updates_out_topic, json.dumps(dict_updates_kafka))

              # Gather any remaining sha1s and clean up infos
              if len(list_sha1s_to_process) > self.indexer.batch_update_size:
                list_sha1s_to_process = list_sha1s_to_process[self.indexer.batch_update_size:]
              else:
                list_sha1s_to_process = []
              # if duplicates wrt list_push, remove them. Can this still happen?
              list_sha1s_to_process = [sh1 for sh1 in list_sha1s_to_process if sh1 not in list_push]
              self.cleanup_dict_infos(list_push)
            else:
              msg = "[{}: at {}] Nothing to push for update {}"
              print(msg.format(self.pp, datetime.now().strftime('%Y-%m-%d:%H.%M.%S'), update_id))
              sys.stdout.flush()
            self.last_push = time.time()
            # TODO: we should create a new update_id here,
            # and let it claim the potential remaining images in 'list_sha1s_to_process'
            # sanity check that len(list_sha1s_to_process) == len(self.dict_sha1_infos) ?

    except Exception as inst:
      exc_type, exc_obj, exc_tb = sys.exc_info()
      fulltb = traceback.format_tb(exc_tb)
      raise type(inst)(" {} ({})".format(inst, ''.join(fulltb)))
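For reference, `set_check_columns` above builds the two check columns by prefixing the indexer's extraction column family to the extraction prefix and appending `_processed` and `_updateid`. A minimal standalone sketch, assuming a column family named `ext` and hypothetical featurizer/detector/input values:

# Sketch of the column names built by ExtractionChecker.set_check_columns above.
# The column family 'ext' and the type values below are assumptions for illustration.
extrcf = "ext"
extr_prefix = "_".join(["sbpycaffe", "feat", "full", "image"])
extr_prefix_base_column_name = extrcf + ":" + extr_prefix
extr_check_column = extr_prefix_base_column_name + "_processed"   # image was processed
batch_check_column = extr_prefix_base_column_name + "_updateid"   # update batch that claimed the image
print([extr_check_column, batch_check_column])
# ['ext:sbpycaffe_feat_full_image_processed', 'ext:sbpycaffe_feat_full_image_updateid']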
Code example #10
class ExtractionChecker(ConfReader):
    """ExtractionChecker class
  """
    def __init__(self,
                 global_conf,
                 prefix=DEFAULT_EXTR_CHECK_PREFIX,
                 pid=None):
        """ExtractionChecker constructor

    :param global_conf: configuration file or dictionary
    :type global_conf: str, dict
    :param prefix: prefix in configuration
    :type prefix: str
    :param pid: process id
    :type pid: int
    """
        self.list_extr_prefix = []
        self.pid = pid
        self.dict_sha1_infos = dict()

        super(ExtractionChecker, self).__init__(global_conf, prefix)

        self.last_push = time.time()
        self.nb_imgs_check = 0
        self.nb_imgs_unproc = 0
        self.nb_imgs_unproc_lastprint = 0

        self.featurizer_type = self.get_required_param("featurizer_type")
        self.detector_type = self.get_required_param("detector_type")
        self.input_type = self.get_required_param("input_type")

        # Max delay
        self.max_delay = int(
            self.get_param("max_delay", default=DEFAULT_MAX_DELAY))
        self.min_len_check = int(
            self.get_param("min_len_check", default=DEFAULT_MIN_LENGTH_CHECK))

        self.list_extr_prefix = [
            self.featurizer_type, "feat", self.detector_type, self.input_type
        ]
        self.extr_prefix = "_".join(self.list_extr_prefix)
        self.batch_check_column = None
        self.check_columns = []

        # changed to: get column family from indexer in set_check_columns
        # Needs to be built from extraction type and detection input + "_processed"
        #self.extr_family_column = self.get_param("extr_family_column", default="ext")
        # self.extr_prefix_base_column_name = self.extr_family_column + ":" + self.extr_prefix
        # self.extr_check_column = self.extr_prefix_base_column_name + "_processed"
        # # Need to be build from extraction type and extraction input + "_batchid"
        # self.batch_check_column = self.extr_prefix_base_column_name + "_updateid"
        # self.check_columns = [self.extr_check_column, self.batch_check_column]

        self.set_pp()

        # Initialize indexer
        self.indexer = HBaseIndexerMinimal(
            self.global_conf, prefix=self.get_required_param("indexer_prefix"))
        self.indexer.pp = "CheckerHBase"
        print(self.get_required_param("indexer_prefix"),
              self.indexer.get_dictcf_sha1_table())
        self.set_check_columns()
        print(self.check_columns)

        # Initialize ingester, that could now be Kafka or Kinesis. Should we have a default?
        ingester_type = self.get_required_param("image_ingestion_type")
        try:
            if ingester_type == "kafka":
                self.ingester = KafkaIngester(
                    self.global_conf,
                    prefix=self.get_required_param("check_ingester_prefix"))
            elif ingester_type == "kinesis":
                self.ingester = KinesisIngester(
                    self.global_conf,
                    prefix=self.get_required_param("check_ingester_prefix"))
            else:
                raise ValueError(
                    "Unknown 'ingester_type': {}".format(ingester_type))
        except Exception as inst:
            # print "Could not initialize checker, sleeping for {}s.".format(self.max_delay)
            # time.sleep(self.max_delay)
            # raise(inst)
            #print("Could not initialize 'updates_out_topic' ({}). Will write only to HBase.".format(inst))
            print("[{}: ERROR] Could not start ingester.".format(
                self.pp, inst))
            raise inst

        # Initialize producer
        # TODO: also check for 'update_ingestion_type' as producer_type?
        producer_type = self.get_param("update_ingestion_type",
                                       DEFAULT_UPDATE_INGESTION_TYPE)
        # TODO: create a producer if 'update_ingestion_type' is Kinesis or Kafka
        # if producer_type != "hbase":
        #   self.updates_out_topic = self.ingester.get_required_param("producer_updates_out_topic")
        if producer_type == "kafka":
            self.pusher = KafkaPusher(
                self.global_conf,
                prefix=self.get_required_param("check_ingester_prefix"))
        elif producer_type == "kinesis":
            self.pusher = KinesisPusher(
                self.global_conf,
                prefix=self.get_required_param("check_ingester_prefix"))
        elif producer_type == "hbase":
            self.pusher = None
            print("[{}: log] Will write updates only to HBase.".format(
                self.pp))
        else:
            raise ValueError(
                "Unknown 'producer_type': {}".format(producer_type))
        #self.ingester.pp = self.get_param("pp", "ImageIngester")

        # Only if daemon mode, as we may have multiple ingesters
        # But for Kinesis the `shard_infos_filename` may not be re-used...
        #if self.pid:
        #  self.ingester.pp += str(self.pid)

    def set_check_columns(self):
        """Set columns to be checked in indexer
    """
        # changed to: get column family from indexer
        # TODO: get the suffixes as global variables maybe from common.defaults
        extr_prefix_base_column_name = self.indexer.extrcf + ":" + self.extr_prefix
        extr_check_column = extr_prefix_base_column_name + "_processed"
        # Needs to be built from extraction type and extraction input + "_updateid"
        self.batch_check_column = extr_prefix_base_column_name + "_updateid"
        self.check_columns = [extr_check_column, self.batch_check_column]
        #print(self.check_columns)

    def set_pp(self, pp=""):
        """Set pretty name
    """
        self.pp = "ExtractionChecker"
        self.pp += "-".join(self.list_extr_prefix)
        if self.pid:
            self.pp += "." + str(self.pid)

    def store_img_infos(self, msg):
        """Store information about the images of ``msg`` in ``self.dict_sha1_infos``

    :param msg: message
    :type msg: dict
    """
        strk = str(msg['sha1']).upper()
        self.dict_sha1_infos[strk] = dict()
        for key in msg:
            # dumps json of 'img_info'
            # We actually need that only for DIG...
            if key == "img_info":
                self.dict_sha1_infos[strk][key] = json.dumps(msg[key])
            else:
                # discard 'img_buffer' (if it exists?...), and 'sha1'
                # if k != "img_buffer" and k != "sha1":
                #  self.dict_sha1_infos[strk][k] = msg[k]
                # discard 'sha1'
                if key != "sha1":
                    self.dict_sha1_infos[strk][key] = msg[key]

    def cleanup_dict_infos(self, list_del_sha1s):
        """Remove images ``list_del_sha1s`` from ``self.dict_sha1_infos``

    :param list_del_sha1s: list of images sha1 to remove
    :type list_del_sha1s: list
    """
        for sha1 in list_del_sha1s:
            try:
                del self.dict_sha1_infos[str(sha1)]
            except:
                # could happen when cleaning up duplicates or image processed by another process
                pass

    def get_dict_push(self, list_get_sha1s, daemon=False):
        """Get dictionary to be pushed to HBase for images in ``list_get_sha1s``

    :param list_get_sha1s: list of images
    :type list_get_sha1s: list
    :param daemon: whether the checker is running in daemon mode
    :type daemon: bool
    :return: (dict_push, update_id)
    :rtype: tuple
    """
        #TODO: is this needed for every get_dict_push call?
        self.set_check_columns()
        # TODO: also pass current update_id, and move the creation of update id out of this method
        #  this method should actually be used to 'claim' an image as soon as we can.
        dict_push = dict()
        # append processid to 'update_id' for safe use with multiple consumers, even after restart
        # /!\ beware, it should not contain underscores
        tmp_update_id, _ = self.indexer.get_next_update_id(
            today=None, extr_type=self.extr_prefix)
        update_id = tmp_update_id + '-' + self.ingester.pp + '-' + str(
            time.time())
        for sha1 in list_get_sha1s:
            dict_push[str(sha1)] = dict()
            try:
                tmp_dict = self.dict_sha1_infos[str(sha1)]
            except:
                # This would mean the image has been marked as part of another batch by another process,
                # and thus deleted in a previous 'get_unprocessed_rows' call
                # This is also only relevant if we run on Daemon mode...
                # TODO: for transition we won't really have any info to push except the update_id...
                if daemon:
                    del dict_push[str(sha1)]
                    continue
            # build column names properly i.e. appending 'info:'
            for key in tmp_dict:
                # changed to: use column_family from indexer
                # But the use of 'key' here also means we rely on the input to define column name...
                #dict_push[str(sha1)]['info:' + key] = tmp_dict[key]
                dict_push[str(sha1)][self.indexer.imginfocf + ':' +
                                     key] = tmp_dict[key]
            dict_push[str(sha1)][self.batch_check_column] = update_id
        return dict_push, update_id

    def get_unprocessed_rows(self, list_check_sha1s):
        """Get the subset of the list of sha1s ``list_check_sha1s`` that have not been processed yet

    :param list_check_sha1s: list of images sha1 to check
    :type list_check_sha1s: list
    :return: set of unprocessed images
    :rtype: set
    """
        # TODO: also pass current update_id and only delete if != from current update...

        unprocessed_rows = set(list_check_sha1s)

        if list_check_sha1s:
            # Check if the selected sha1 rows in HBase table 'sha1infos' have those check_column
            # This call will only return rows that DO have those check_column
            fam = self.indexer.get_dictcf_sha1_table()
            try:
                sha1s_rows = self.indexer.get_columns_from_sha1_rows(
                    list_check_sha1s, self.check_columns, families=fam)
            except Exception as inst:
                print("[{}.get_unprocessed_rows: log] fam: {}".format(
                    self.pp, fam))
                raise inst

                #families=self.tablesha1_col_families)
            if sha1s_rows:
                # TODO: only delete if really previously processed, i.e. if != from current update...
                found_sha1_rows = set([str(row[0]) for row in sha1s_rows])
                # Clean up 'dict_sha1_infos' deleting found_sha1_rows
                self.cleanup_dict_infos(found_sha1_rows)
                set_list_check_sha1s = set(list_check_sha1s)
                # TODO: but we should not re-add them, so we should discard them from unprocessed_rows
                unprocessed_rows = set_list_check_sha1s - found_sha1_rows

        return unprocessed_rows

    def run(self, daemon=False):
        """Run extraction checker

    :param daemon: whether we are running in daemon mode
    :type daemon: bool
    :raises Exception: if check fails
    """
        # import inspect
        # if not inspect.isgeneratorfunction(self.ingester.get_msg_json()):
        #   msg = "[{}: Warning] Ingester {} function `get_msg_json` is not a generator"
        #   print(msg.format(self.pp, type(self.ingester)))

        try:
            list_sha1s_to_process = []
            list_check_sha1s = []
            # TODO: create update_id here

            if self.verbose > 1:
                msg = "[{}: log] Start run main loop"
                print(msg.format(self.pp))

            while True:
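                # Each iteration: (1) accumulate image infos from the ingester, (2) filter out
                # sha1s already processed or already claimed by an update, (3) push a batch to
                # HBase (and the pusher, if any) once a full batch is gathered or 'max_delay' expired.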

                msg = None
                try:
                    # Accumulate images infos
                    #while len(list_check_sha1s) < self.indexer.batch_update_size:
                    #while len(list_check_sha1s) < self.min_len_check:

                    for msg in self.ingester.get_msg_json():
                        try:
                            # Fix if input was JSON dumped twice?
                            if not isinstance(msg, dict):
                                msg = json.loads(msg)
                            # msg could now contain keys 'sha1' or 'list_sha1s'
                            if 'sha1' in msg:
                                list_check_sha1s.append(
                                    str(msg['sha1']).upper())
                                # Store other fields to be able to push them too
                                self.store_img_infos(msg)
                            elif 'list_sha1s' in msg:
                                for sha1 in msg['list_sha1s']:
                                    list_check_sha1s.append(str(sha1).upper())
                                    # We won't have any additional infos in this case, but we
                                    # should still build a dict for each sample for consistency...
                                    tmp_dict = dict()
                                    tmp_dict['sha1'] = str(sha1).upper()
                                    # This pushes a dict with just the sha1 to self.dict_sha1_infos,
                                    # so self.get_dict_push works properly later on...
                                    self.store_img_infos(tmp_dict)
                            else:
                                raise ValueError(
                                    'Unknown keys in msg: {}'.format(
                                        msg.keys()))
                            # This is dangerous, as it assumes the self.ingester.get_msg_json() generator
                            # would restart from the next point... Is this the case for Kafka?
                            prev_len = len(list_check_sha1s)
                            list_check_sha1s = list(set(list_check_sha1s))
                            if len(list_check_sha1s) < prev_len:
                                msg = "[{}: log] Removed {} duplicate from `list_check_sha1s`"
                                print(
                                    msg.format(
                                        self.pp,
                                        prev_len - len(list_check_sha1s)))
                            if len(list_check_sha1s
                                   ) >= self.indexer.batch_update_size:
                                break
                        except Exception as inst:
                            pr_msg = "[{}: ERROR] Could not process message: {}. {}"
                            print(pr_msg.format(self.pp, msg, inst))
                except Exception as inst:
                    pr_msg = "[{}: at {} ERROR] Caught {} {} in consumer loop"
                    now_str = datetime.now().strftime('%Y-%m-%d:%H.%M.%S')
                    print(pr_msg.format(self.pp, now_str, type(inst), inst))
                    if msg is not None:
                        print(msg)
                    sys.stdout.flush()

                if self.verbose > 3:
                    msg = "[{}: log] Gathered {} images to check so far"
                    msg = msg.format(self.pp, len(list_check_sha1s))
                    msg2 = ""
                    if len(list_check_sha1s) > 0:
                        msg2 = " (first: {}, last: {})"
                        msg2 = msg2.format(list_check_sha1s[0],
                                           list_check_sha1s[-1])
                    print(msg + msg2)

                # To be able to push one (non empty) update every max_delay
                #if not list_check_sha1s and (time.time() - self.last_push) < self.max_delay:
                if len(list_check_sha1s) < self.indexer.batch_update_size and (
                        time.time() - self.last_push) < self.max_delay:
                    time.sleep(1)
                    continue

                self.nb_imgs_check += len(list_check_sha1s)
                push_delay = (time.time() - self.last_push) > max(
                    int(self.max_delay / 60), 10)
                if push_delay and self.nb_imgs_unproc_lastprint != self.nb_imgs_unproc:
                    msg = "[{}: log] Pushed {} unprocessed images so far"
                    print(
                        msg.format(self.pp, self.nb_imgs_unproc,
                                   self.nb_imgs_check))
                    self.nb_imgs_unproc_lastprint = self.nb_imgs_unproc

                if list_check_sha1s:
                    # Check which images have not been processed (or pushed in an update) yet
                    # This seems slow
                    start_check = time.time()
                    unprocessed_rows = self.get_unprocessed_rows(
                        list_check_sha1s)
                    msg = "[{}: log] Found {}/{} unprocessed images in {:.2f}s"
                    print(
                        msg.format(self.pp, len(unprocessed_rows),
                                   len(list_check_sha1s),
                                   time.time() - start_check))
                    if len(unprocessed_rows) != len(
                            list_check_sha1s) and self.verbose > 5:
                        already_processed = list(
                            set(list_check_sha1s) - set(unprocessed_rows))
                        msg = "[{}: log] Images ".format(self.pp)
                        for ap in already_processed:
                            msg += "{} ".format(ap)
                        msg += "were already processed."
                        print(msg)

                    #unprocessed_rows = self.get_unprocessed_rows(list_check_sha1s)

                    # TODO: we should mark those images as being 'owned' by the update we are constructing
                    # (only important if we are running multiple threads i.e. daemon is True)
                    # otherwise another update running at the same time could also claim it (in another ad)
                    # this could be handled when adding data to the searcher, but it induces duplicates in the extraction process...

                    # Push sha1s to be processed
                    for sha1 in unprocessed_rows:
                        list_sha1s_to_process.append(sha1)

                    # Remove potential duplicates
                    list_sha1s_to_process = list(set(list_sha1s_to_process))
                    list_check_sha1s = []

                if list_sha1s_to_process:
                    # Push them to HBase by batch of 'batch_update_size'
                    push_delay = (time.time() -
                                  self.last_push) > self.max_delay
                    full_batch = len(list_sha1s_to_process
                                     ) >= self.indexer.batch_update_size
                    if full_batch or (push_delay and list_sha1s_to_process):
                        # Trim here to push exactly a batch of 'batch_update_size'
                        list_push = list_sha1s_to_process[:min(
                            self.indexer.batch_update_size, len(list_sha1s_to_process))]

                        # TODO: this should be done before,
                        # to 'claim' the images as soon as we plan to process them for this update
                        # Gather corresponding sha1 infos
                        dict_push, update_id = self.get_dict_push(
                            list_push, daemon=daemon)
                        if dict_push:
                            self.nb_imgs_unproc += len(dict_push.keys())
                            msg = "[{}: at {}] Pushing update {} of {} images."
                            now_str = datetime.now().strftime(
                                '%Y-%m-%d:%H.%M.%S')
                            print(
                                msg.format(self.pp, now_str, update_id,
                                           len(dict_push.keys())))
                            sys.stdout.flush()

                            # Push images
                            fam = self.indexer.get_dictcf_sha1_table()
                            if self.verbose > 5:
                                msg = "[{}] Pushing images for update {} with fam {}"
                                print(msg.format(self.pp, update_id, fam))
                            sha1s_table = self.indexer.table_sha1infos_name
                            self.indexer.push_dict_rows(dict_push,
                                                        sha1s_table,
                                                        families=fam)

                            # Build HBase updates dict
                            dict_updates_db = dict()
                            now_str = datetime.now().strftime(
                                '%Y-%m-%d:%H.%M.%S')
                            list_sha1s_col = self.indexer.get_col_listsha1s()
                            dict_updates_db[update_id] = {
                                list_sha1s_col: ','.join(dict_push.keys()),
                                self.indexer.get_col_upcreate(): now_str
                            }
                            # Push it
                            fam = self.indexer.get_dictcf_update_table()
                            if self.verbose > 5:
                                msg = "[{}] Pushing update {} info with fam {}"
                                print(msg.format(self.pp, update_id, fam))
                            self.indexer.push_dict_rows(
                                dict_updates_db,
                                self.indexer.table_updateinfos_name,
                                families=fam)

                            # Build pusher updates dict
                            if self.pusher is not None:
                                dict_updates_kafka = dict()
                                dict_updates_kafka[update_id] = ','.join(
                                    dict_push.keys())
                                # Push it
                                #self.ingester.producer.send(self.updates_out_topic, json.dumps(dict_updates_kafka))
                                #self.pusher.send(self.updates_out_topic, dict_updates_kafka)
                                self.pusher.send(dict_updates_kafka)

                            # Gather any remaining sha1s and clean up infos
                            if len(list_sha1s_to_process
                                   ) > self.indexer.batch_update_size:
                                list_sha1s_to_process = list_sha1s_to_process[
                                    self.indexer.batch_update_size:]
                            else:
                                list_sha1s_to_process = []
                            # if duplicates wrt list_push, remove them. Can this still happen?
                            list_sha1s_to_process = [
                                sh1 for sh1 in list_sha1s_to_process
                                if sh1 not in list_push
                            ]
                            self.cleanup_dict_infos(list_push)
                        else:
                            msg = "[{}: at {}] Nothing to push for update {}"
                            print(
                                msg.format(
                                    self.pp,
                                    datetime.now().strftime(
                                        '%Y-%m-%d:%H.%M.%S'), update_id))
                            sys.stdout.flush()
                        self.last_push = time.time()
                        # TODO: we should create a new update_id here,
                        # and let it claim the potential remaining images in 'list_sha1s_to_process'
                        # sanity check that len(list_sha1s_to_process) == len(self.dict_sha1_infos) ?

                    else:
                        if self.verbose > 3:
                            msg = "[{}: at {}] Gathered {} images so far..."
                            now_str = datetime.now().strftime(
                                '%Y-%m-%d:%H.%M.%S')
                            print(
                                msg.format(self.pp, now_str,
                                           len(list_sha1s_to_process)))

        except Exception as inst:
            exc_type, exc_obj, exc_tb = sys.exc_info()
            fulltb = traceback.format_tb(exc_tb)
            raise type(inst)(" {} ({})".format(inst, ''.join(fulltb)))
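A minimal launch sketch for the daemon-mode checker above, not part of the original example: the configuration path, the parameter prefix and the restart delay are illustrative placeholders, and it assumes the ExtractionChecker class defined in this listing is in scope and that its ConfReader base accepts a parsed JSON configuration dict.

import json
import time

CONF_PATH = "../conf/extraction_checker_conf.json"  # hypothetical configuration file
CHECKER_PREFIX = "EXTR_CHECK_"                       # hypothetical parameter prefix

if __name__ == "__main__":
    # Assumption: the ConfReader base class accepts a parsed configuration dict
    with open(CONF_PATH, "rt") as conf_file:
        global_conf = json.load(conf_file)
    while True:
        try:
            # ExtractionChecker is the daemon-capable class defined above
            checker = ExtractionChecker(global_conf, prefix=CHECKER_PREFIX)
            checker.run(daemon=True)
        except Exception as inst:
            # run() re-raises with a formatted traceback; wait a bit before restarting
            print("Checker died ({}), restarting in 60s...".format(inst))
            time.sleep(60)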
Code example #11
0
class ExtractionChecker(ConfReader):
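  """Check which ingested images have not been processed yet and push them as
  batched 'updates' to HBase (and a Kafka topic) for the extraction processors."""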

  def __init__(self, global_conf, prefix=default_extr_check_prefix, pid=None):
    self.list_extr_prefix = []
    self.pid = pid
    self.dict_sha1_infos = dict()

    super(ExtractionChecker, self).__init__(global_conf, prefix)

    self.featurizer_type = self.get_required_param("featurizer_type")
    self.detector_type = self.get_required_param("detector_type")
    self.input_type = self.get_required_param("input_type")

    # Need to be build from extraction type and detection input + "_processed"
    self.extr_family_column = "ext"
    tmp_extr_family_column = self.get_param("extr_family_column")
    if tmp_extr_family_column:
      self.extr_family_column = tmp_extr_family_column

    # Max delay
    self.max_delay = 3600
    # self.max_delay = 600
    max_delay = self.get_param("max_delay")
    if max_delay:
      self.max_delay = int(max_delay)
    self.last_push = time.time()
    self.nb_imgs_check = 0
    self.nb_imgs_unproc = 0
    self.nb_imgs_unproc_lastprint = 0

    # Beware, the self.extr_family_column should be added to the indexer families parameter in get_create_table...
    # TODO: should we add the 'ad' column family too here by default
    self.tablesha1_col_families = {'info': dict(), self.extr_family_column: dict()}
    self.list_extr_prefix = [self.featurizer_type, "feat", self.detector_type, self.input_type]
    self.extr_prefix = "_".join(self.list_extr_prefix)
    self.extr_prefix_base_column_name = self.extr_family_column + ":" + self.extr_prefix
    self.extr_check_column = self.extr_prefix_base_column_name + "_processed"
    # Need to be build from extraction type and extraction input + "_batchid"
    self.batch_check_column = self.extr_prefix_base_column_name + "_updateid"
    self.check_columns = [self.extr_check_column, self.batch_check_column]

    self.set_pp()

    # Initialize indexer and ingester
    self.indexer = HBaseIndexerMinimal(self.global_conf,
                                       prefix=self.get_required_param("indexer_prefix"))
    self.ingester = GenericKafkaProcessor(self.global_conf,
                                          prefix=self.get_required_param("check_ingester_prefix"))
    # This will not be set for HBase processing, but checker would keep dying here...
    try:
      self.updates_out_topic = self.ingester.get_required_param("producer_updates_out_topic")
    except Exception as inst:
      print "Could not initialize checker, sleeping for {}s.".format(self.max_delay)
      time.sleep(self.max_delay)
      raise(inst)
    self.ingester.pp = "ec"
    if self.pid:
      self.ingester.pp += str(self.pid)

  def set_pp(self):
    self.pp = "ExtractionChecker."
    self.pp += "-".join(self.list_extr_prefix)
    if self.pid:
      self.pp += "." + str(self.pid)

  def store_img_infos(self, msg):
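    """Store the additional fields of image 'msg' (keyed by its sha1) so they can be pushed to HBase later on."""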
    strk = str(msg['sha1'])
    self.dict_sha1_infos[strk] = dict()
    for k in msg:
      # dumps json of 'img_info'
      if k == "img_info":
        self.dict_sha1_infos[strk][k] = json.dumps(msg[k])
      else:
        # discard 'img_buffer' (if it exists?...), and 'sha1'
        # if k != "img_buffer" and k != "sha1":
        #  self.dict_sha1_infos[strk][k] = msg[k]
        # discard 'sha1'
        if k != "sha1":
          self.dict_sha1_infos[strk][k] = msg[k]

  def cleanup_dict_infos(self, list_del_sha1s):
    for sha1 in list_del_sha1s:
      try:
        del self.dict_sha1_infos[str(sha1)]
      except KeyError:
        # could happen when cleaning up duplicates or an image processed by another process
        pass

  def get_dict_push(self, list_get_sha1s):
    # TODO: also pass current update_id, and move the creation of update id out of this method
    #  this method should actually be used to 'claim' an image as soon as we can.
    dict_push = dict()
    # append unique processid to 'update_id' to make it safe to use with multiple consumers, even after a restart.
    # /!\ beware, it should not contain underscores
    tmp_update_id, _ = self.indexer.get_next_update_id(today=None, extr_type=self.extr_prefix)
    update_id = tmp_update_id + '-' + self.ingester.pp + '-' + str(time.time())
    for sha1 in list_get_sha1s:
      dict_push[str(sha1)] = dict()
      try:
        tmp_dict = self.dict_sha1_infos[str(sha1)]
      except KeyError:
        # This would mean the image has been marked as part of another batch by another process,
        # and thus deleted in a previous 'get_unprocessed_rows' call
        del dict_push[str(sha1)]
        continue
      # build column names properly i.e. appending 'info:'
      for k in tmp_dict:
        dict_push[str(sha1)]['info:' + k] = tmp_dict[k]
      dict_push[str(sha1)][self.batch_check_column] = update_id
    return dict_push, update_id

  def get_unprocessed_rows(self, list_sha1s_to_check):
    # TODO: also pass current update_id and only delete if != from current update...

    unprocessed_rows = set(list_sha1s_to_check)

    if list_sha1s_to_check:
      # Check if the selected sha1 rows in HBase table 'sha1infos' have those check_column
      # This call will only return rows that DO have those check_column
      sha1s_rows = self.indexer.get_columns_from_sha1_rows(list_sha1s_to_check, self.check_columns,
                                                           families=self.tablesha1_col_families)
      if sha1s_rows:
        # TODO: only delete if really previously processed, i.e. if != from current update...
        found_sha1_rows = set([str(row[0]) for row in sha1s_rows])
        # Clean up 'dict_sha1_infos' deleting found_sha1_rows
        self.cleanup_dict_infos(found_sha1_rows)
        set_list_sha1s_to_check = set(list_sha1s_to_check)
        # TODO: but we should not re-add them, so we should discard them from unprocessed_rows
        unprocessed_rows = set_list_sha1s_to_check - found_sha1_rows

    return unprocessed_rows

  def run(self):
    try:
      list_sha1s_to_process = []
      # TODO: create update_id here

      while True:
        list_sha1s_to_check = []

        try:
          # Accumulate images infos
          for msg_json in self.ingester.consumer:
            msg = json.loads(msg_json.value)
            list_sha1s_to_check.append(str(msg['sha1']))

            # Store other fields to be able to push them too
            self.store_img_infos(msg)

            if len(list_sha1s_to_check) >= self.indexer.batch_update_size:
              break
        except Exception as inst:
          # trying to use 'consumer_timeout_ms' to raise timeout and get last samples
          warn_msg = "[{}: warning] At {}, caught {} {} in consumer loop"
          print warn_msg.format(self.pp, datetime.now().strftime('%Y-%m-%d:%H.%M.%S'), type(inst), inst)
          sys.stdout.flush()

        # Check which images have not been processed (or pushed in an update) yet
        unprocessed_rows = self.get_unprocessed_rows(list_sha1s_to_check)
        self.nb_imgs_check += len(list_sha1s_to_check)
        if (time.time() - self.last_push) > self.max_delay / 60 \
                and self.nb_imgs_unproc_lastprint != self.nb_imgs_unproc:
          msg_log = "[{}: log] Found {}/{} unprocessed images"
          print msg_log.format(self.pp, self.nb_imgs_unproc, self.nb_imgs_check)
          self.nb_imgs_unproc_lastprint = self.nb_imgs_unproc

        # TODO: we should mark those images as being 'owned' by the update we are constructing
        #   (only really important if we are running multiple threads...)
        #   otherwise another update running at the same time could also claim it (if it appears in another ad)
        #   this can be handled when adding data to the searcher but induces duplicates in the extraction process...

        # Push sha1s to be processed
        for sha1 in unprocessed_rows:
          list_sha1s_to_process.append(sha1)

        # Remove potential duplicates
        list_sha1s_to_process = list(set(list_sha1s_to_process))

        if list_sha1s_to_process:
          # Push them to HBase by batch of 'batch_update_size'
          if len(list_sha1s_to_process) >= self.indexer.batch_update_size or (
                  (time.time() - self.last_push) > self.max_delay and len(list_sha1s_to_process) > 0):
            # Trim here to push exactly a batch of 'batch_update_size'
            list_push = list_sha1s_to_process[:min(self.indexer.batch_update_size, len(list_sha1s_to_process))]

            # TODO: this should be done before, to 'claim' the images as soon as we plan to process them for this update
            # Gather corresponding sha1 infos
            dict_push, update_id = self.get_dict_push(list_push)
            if dict_push:
              self.nb_imgs_unproc += len(dict_push.keys())
              push_msg = "[{}: at {}] Pushing update {} of {} images."
              print push_msg.format(self.pp, datetime.now().strftime('%Y-%m-%d:%H.%M.%S'), update_id,
                                    len(dict_push.keys()))
              sys.stdout.flush()

              # Push images
              self.indexer.push_dict_rows(dict_push, self.indexer.table_sha1infos_name,
                                          families=self.tablesha1_col_families)

              # Build updates dict
              dict_updates_db = dict()
              dict_updates_kafka = dict()
              dict_updates_db[update_id] = {self.indexer.column_list_sha1s: ','.join(dict_push.keys()),
                                            'info:' + update_str_created: datetime.now().strftime('%Y-%m-%d:%H.%M.%S')}
              dict_updates_kafka[update_id] = ','.join(dict_push.keys())
              # Push them
              self.indexer.push_dict_rows(dict_updates_db, self.indexer.table_updateinfos_name)
              self.ingester.producer.send(self.updates_out_topic, json.dumps(dict_updates_kafka))

              # Gather any remaining sha1s and clean up infos
              if len(list_sha1s_to_process) > self.indexer.batch_update_size:
                list_sha1s_to_process = list_sha1s_to_process[self.indexer.batch_update_size:]
              else:
                list_sha1s_to_process = []
              # if duplicates wrt list_push, remove them. Can this still happen?
              list_sha1s_to_process = [sha1 for sha1 in list_sha1s_to_process if sha1 not in list_push]
              self.cleanup_dict_infos(list_push)
            else:
              no_push_msg = "[{}: at {}] Nothing to push for update {}"
              print no_push_msg.format(self.pp, datetime.now().strftime('%Y-%m-%d:%H.%M.%S'), update_id)
              sys.stdout.flush()
            self.last_push = time.time()
            # TODO: we should create a new update_id here, and let it claim the potential remaining images in 'list_sha1s_to_process'
            # sanity check that len(list_sha1s_to_process) == len(self.dict_sha1_infos) ?

    except Exception as inst:
      exc_type, exc_obj, exc_tb = sys.exc_info()
      fulltb = traceback.format_tb(exc_tb)
      raise type(inst)(" {} ({})".format(inst, ''.join(fulltb)))
Code example #12
0
from cufacesearch.indexer.hbase_indexer_minimal import HBaseIndexerMinimal
from cufacesearch.detector.utils import show_bbox_from_URL
import numpy as np
import base64

hbim = HBaseIndexerMinimal('../conf/global_conf_test_get_face_hbase.json')

list_sha1s = ['000000D29139258BD3716C94A68CFF54A8A7C033', '000001BF13372B9665A89ED25E8948FC7F99F7F1']
# TODO: use column_family and column_name from indexer
rows = hbim.get_columns_from_sha1_rows(list_sha1s, ['face', 'info:s3_url'])

for sha1, data in rows:
  print sha1, data
  url = data['info:s3_url']
  for key in data:
    if key.startswith('face:'):
      face_bbox = key.split('face:dlib_feat_dlib_face_')[-1].split('_')
      feat_b64 = np.frombuffer(base64.b64decode(data[key]), dtype=np.float32)
      print feat_b64.shape, feat_b64
      show_bbox_from_URL(url, map(int, face_bbox), close_after=1)
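The loop above relies on each face descriptor being stored as a base64-encoded float32 vector under a 'face:' column whose qualifier embeds the bounding box. A minimal round-trip sketch of that encoding follows; the qualifier layout and the bounding-box coordinate order are assumptions inferred from the parsing above, and the values are made up.

import base64
import numpy as np

feat = np.random.rand(128).astype(np.float32)  # e.g. a 128-d dlib face descriptor
bbox = (72, 30, 180, 138)                      # hypothetical bounding box coordinates

# Encode: the raw float32 bytes are base64-encoded into the HBase cell value,
# and the bounding box is embedded in the column qualifier (order assumed here)
column = 'face:dlib_feat_dlib_face_' + '_'.join(map(str, bbox))
cell_value = base64.b64encode(feat.tobytes())

# Decode: exactly what the loop above does for each 'face:' column
decoded = np.frombuffer(base64.b64decode(cell_value), dtype=np.float32)
assert np.allclose(feat, decoded)
print(column)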
Code example #13
0
# This should be run in the docker
from __future__ import print_function
from cufacesearch.indexer.hbase_indexer_minimal import HBaseIndexerMinimal
import sys

start_row = '0' * 40
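# sha1 row keys are 40 hexadecimal characters, so the line above starts the scan
# from the smallest possible row key, i.e. the beginning of the table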

# Change this to point to your own generated configuration file
hbim = HBaseIndexerMinimal(
    '../conf/generated/conf_extraction_lfw_local_dlib.json')
#hbim.get_updates_from_date()
nb_face = 0
nb_image = 0
prev_row = '~'
curr_row = start_row
#print(curr_row)
file_names = []
sha1s = []
#print('Scanning', end='', flush=True)
sys.stdout.write('Scanning')
sys.stdout.flush()
while prev_row != curr_row + '~':
    prev_row = curr_row
    #if prev_row != start_row:
    prev_row += '~'
    #print(prev_row)
    #print('.', end='', flush=True)
    sys.stdout.write('.')
    sys.stdout.flush()
    for row in hbim.scan_from_row(hbim.table_sha1infos_name,
                                  row_start=prev_row,