def run(self):
    from cufacesearch.imgio.imgio import get_buffer_from_URL, get_buffer_from_filepath, buffer_to_B64

    while not self.q_in.empty():
      try:
        # The queue should already have items, no need to block
        (sha1, in_img, push_back) = self.q_in.get(False)
      except Exception:
        # Most likely Queue.Empty: another worker grabbed the last item first
        continue

      try:
        if self.url_input:
          try:
            img_buffer = get_buffer_from_URL(in_img)
          except Exception as inst:
            if self.fallback_pattern:
              # Adding fallback to Tellfinder images here
              # TODO: should we and how could we also update URL in DB?
              img_buffer = get_buffer_from_URL(self.fallback_pattern.format(sha1))
            else:
              raise inst
        else:
          img_buffer = get_buffer_from_filepath(in_img)
        if img_buffer:
          # Push
          self.q_out.put((sha1, buffer_to_B64(img_buffer), push_back, None))
      except Exception as inst:
        self.q_out.put((sha1, None, push_back, inst))

      # Mark as done
      self.q_in.task_done()
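
# ---------------------------------------------------------------------------
# Illustrative usage sketch (not part of the original source): how the queues
# feeding the run() loop above are typically wired, assuming this run() belongs
# to the ThreadedDownloaderBufferOnly class used further below.
#
#   from Queue import Queue
#   q_in_dl, q_out_dl = Queue(0), Queue(0)
#   # input items are (sha1, image_url_or_path, push_back)
#   q_in_dl.put(("da39a3ee5e6b4b0d3255bfef95601890afd80709",  # placeholder sha1
#                "http://example.com/img.jpg", False))
#   dl_thread = ThreadedDownloaderBufferOnly(q_in_dl, q_out_dl, url_input=True)
#   dl_thread.start()
#   q_in_dl.join()
#   # output items are (sha1, b64_buffer_or_None, push_back, error_or_None)
#   sha1, b64_buffer, push_back, err = q_out_dl.get()
# ---------------------------------------------------------------------------
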
    def process_batch(self):
        """Process one batch of images.

        :raises Exception: if something goes really wrong
        """
        # Get a new update batch
        try:
            for rows_batch, update_id in self.get_batch():
                start_update = time.time()
                print("[{}] Processing update {} of {} rows.".format(
                    self.pp, update_id, len(rows_batch)))
                sys.stdout.flush()

                # Initialize
                self.nb_empt = 0
                self.init_queues()
                threads = []

                # If we have deleted an extractor at some point or for first batch
                nb_extr_to_create = self.nb_threads - len(self.extractors)
                if nb_extr_to_create:
                    start_create_extractor = time.time()
                    while len(self.extractors) < min(self.nb_threads,
                                                     len(rows_batch)):
                        # DONE: use 'out_indexer'
                        self.extractors.append(
                            GenericExtractor(self.detector_type,
                                             self.featurizer_type,
                                             self.input_type,
                                             self.out_indexer.extrcf,
                                             self.featurizer_prefix,
                                             self.global_conf))
                    msg = "[{}] Created {} extractors in {}s."
                    create_extr_time = time.time() - start_create_extractor
                    print(
                        msg.format(self.pp, len(self.extractors),
                                   create_extr_time))

                # Mark batch as started being processed
                now_str = datetime.now().strftime('%Y-%m-%d:%H.%M.%S')
                # DONE: use out_indexer (was: info_column_family + ':' + update_str_started,
                # then self.column_update_started)
                dict_val = {self.out_indexer.get_col_upstart(): now_str}
                update_started_dict = {update_id: dict_val}
                # DONE: use out_indexer
                self.out_indexer.push_dict_rows(
                    dict_rows=update_started_dict,
                    table_name=self.out_indexer.table_updateinfos_name)

                # TODO: define a get_buffer_images method
                # --------
                # Push images to queue
                list_in = []
                # For parallelized downloading...
                from Queue import Queue
                nb_imgs_dl = 0
                q_in_dl = Queue(0)
                q_out_dl = Queue(0)

                start_get_buffer = time.time()
                # DONE: use in_indexer in all this scope
                # How could we transfer URL from in table to out table if they are different?...
                for img in rows_batch:
                    # should decode base64
                    #if img_buffer_column in img[1]:
                    if self.in_indexer.get_col_imgbuff() in img[1]:
                        # Messy: wrap the raw cell value in a file-like object and
                        # re-encode it as base64 so it matches the downloader output format
                        b64buffer = buffer_to_B64(
                            cStringIO.StringIO(
                                img[1][self.in_indexer.get_col_imgbuff()]))
                        tup = (img[0], b64buffer, False)
                        list_in.append(tup)
                    else:
                        # need to re-download, accumulate a list of URLs to download
                        # Deal with img_path_column for local_images_kafka_pusher
                        if self.img_column in img[1]:
                            q_in_dl.put((img[0], img[1][self.img_column],
                                         self.push_back))
                            nb_imgs_dl += 1
                        elif self.in_indexer.get_col_imgurlbak() in img[1]:
                            q_in_dl.put(
                                (img[0],
                                 img[1][self.in_indexer.get_col_imgurlbak()],
                                 self.push_back))
                            nb_imgs_dl += 1
                        else:
                            msg = "[{}: warning] No buffer and no URL/path for image {} !"
                            print(msg.format(self.pp, img[0]))
                            continue
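                # At this point list_in holds (sha1, b64_buffer, push_back) tuples for rows
                # that already carried an image buffer, while q_in_dl holds
                # (sha1, url_or_path, push_back) tuples for the nb_imgs_dl rows that
                # still need to be downloaded by the threads below.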

                # Download missing images
                nb_dl = 0
                nb_dl_failed = 0
                if nb_imgs_dl > 0:
                    threads_dl = []
                    for i in range(min(self.nb_threads, nb_imgs_dl)):
                        # should read (url, obj_pos) from self.q_in
                        # and push (url, obj_pos, buffer, img_info, start_process, end_process) to self.q_out
                        thread = ThreadedDownloaderBufferOnly(
                            q_in_dl, q_out_dl, url_input=self.url_input)
                        thread.start()
                        threads_dl.append(thread)

                    q_in_dl.join()
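                    # join() returns once every queued item has been matched by a
                    # task_done() call in the downloader's run() loop; the downloader
                    # threads then exit on their own when they find q_in_dl empty, so
                    # threads_dl does not need an explicit join here.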

                    # Push downloaded images to list_in too
                    while nb_dl < nb_imgs_dl:
                        # Use a timeout so a crashed downloader cannot block here forever
                        # (was a plain blocking q_out_dl.get())
                        try:
                            sha1, buffer, push_back, inst = q_out_dl.get(
                                True, 10)
                        except Exception as queue_err:
                            msg = "[{}: error] Download queue out timed out: {}"
                            print(msg.format(self.pp, queue_err))
                            break
                        nb_dl += 1
                        if inst:
                            if self.verbose > 6:
                                msg = "[{}: log] Could not download image {}, error was: {}"
                                print(msg.format(self.pp, sha1, inst))
                            nb_dl_failed += 1
                        else:
                            if buffer:
                                list_in.append((sha1, buffer, push_back))
                            else:
                                # Is that even possible?
                                msg = "[{}: error] No error but no buffer either for image {}"
                                print(msg.format(self.pp, sha1))

                get_buffer_time = time.time() - start_get_buffer
                msg = "[{}] Got {}/{} image buffers ({}/{} downloaded) for update {} in {}s."
                print(
                    msg.format(self.pp, len(list_in), len(rows_batch),
                               nb_dl - nb_dl_failed, nb_dl, update_id,
                               get_buffer_time))
                sys.stdout.flush()
                # --------

                # TODO: define a get_features method
                # --------
                q_batch_size = int(
                    math.ceil(float(len(list_in)) / self.nb_threads))
                for i, q_batch in enumerate(build_batch(list_in,
                                                        q_batch_size)):
                    self.q_in[i].put(q_batch)
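                # Example of the split above (assuming build_batch yields consecutive
                # chunks of size q_batch_size): with 100 buffers in list_in and
                # nb_threads=8, q_batch_size = ceil(100/8) = 13, so the first seven
                # input queues each get one batch of 13 images and the eighth gets
                # the remaining 9.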

                q_in_size = []
                q_in_size_tot = 0
                for i in range(self.nb_threads):
                    q_in_size.append(self.q_in[i].qsize())
                    q_in_size_tot += q_in_size[i]
                if self.verbose > 5:
                    print("[{}] Total input queues sizes is: {}".format(
                        self.pp, q_in_size_tot))

                # Start daemons...
                thread_creation_failed = [0] * self.nb_threads
                for i in range(self.nb_threads):
                    # one per non empty input queue
                    if q_in_size[i] > 0:
                        try:
                            thread = DaemonBatchExtractor(self.extractors[i],
                                                          self.q_in[i],
                                                          self.q_out[i],
                                                          verbose=self.verbose)
                            # Could get a 'Cannot allocate memory' if we are using too many threads...
                            thread.start()
                            threads.append(thread)
                        except OSError as inst:
                            # Should we try to push self.q_in[i] data to some other thread?
                            msg = "[{}.process_batch: error] Could not start thread #{}: {}"
                            print(msg.format(self.pp, i + 1, inst))
                            thread_creation_failed[i] = 1
                            time.sleep(10 * sum(thread_creation_failed))

                if sum(thread_creation_failed) == self.nb_threads:
                    # We are in trouble...
                    raise ValueError("Could not start any thread...")
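                # Note: each failed thread start above slept 10s per accumulated failure
                # (10s, 20s, ...), presumably to let memory pressure ease before the next
                # attempt. Started threads are tracked in `threads`; the i_q_in arithmetic
                # further down maps a started thread back to its original input queue by
                # skipping queues whose thread never started.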

                nb_threads_running = len(threads)
                start_process = time.time()
                stop = time.time() + self.max_proc_time
                # Wait for all tasks to be marked as done
                threads_finished = [0] * nb_threads_running
                deleted_extr = [0] * nb_threads_running
                thread_msg = "[{}] Thread {}/{} (pid: {}) "
                while sum(threads_finished) < nb_threads_running:
                    for i in range(nb_threads_running):
                        if sum(threads_finished) == nb_threads_running:
                            sys.stdout.flush()
                            break
                        if threads_finished[i] == 1:
                            continue
                        i_q_in = i + sum(thread_creation_failed[:i + 1])
                        if q_in_size[i_q_in] > 0:
                            # This seems to block forever sometimes, if subprocess crashed?...
                            #self.q_in[i].join()
                            # Manual join with timeout...
                            # https://github.com/python/cpython/blob/3.6/Lib/multiprocessing/queues.py
                            if (not self.q_in[i_q_in]._unfinished_tasks._semlock._is_zero()
                                    and time.time() < stop):
                                time.sleep(1)
                            else:
                                if self.q_in[i_q_in]._unfinished_tasks._semlock._is_zero():
                                    if self.verbose > 5:
                                        msg = thread_msg + "marked as finished because processing seems finished"
                                        print(
                                            msg.format(self.pp, i + 1,
                                                       nb_threads_running,
                                                       threads[i].pid))
                                else:
                                    if self.verbose > 0:
                                        # In which cases does this actually happen?...
                                        msg = thread_msg + "force marked task as done as max_proc_time ({}) has passed."
                                        print(
                                            msg.format(self.pp, i + 1,
                                                       nb_threads_running,
                                                       threads[i].pid,
                                                       self.max_proc_time))
                                        sys.stdout.flush()
                                        # Try to delete corresponding extractor to free memory?
                                        # And reduce number of threads at the end of the loop
                                    try:
                                        self.q_in[i_q_in].task_done()
                                        if deleted_extr[i] == 0:
                                            # extractors were created in input-queue order,
                                            # so the one to drop is at index i_q_in
                                            del self.extractors[i_q_in]
                                            deleted_extr[i] = 1
                                    except Exception:
                                        pass
                                threads_finished[i] = 1
                        else:
                            if self.verbose > 2:
                                # We actually never gave something to process...
                                msg = thread_msg + "marked as finished because no data was passed to it"
                                print(
                                    msg.format(self.pp, i + 1,
                                               nb_threads_running,
                                               threads[i].pid))
                            threads_finished[i] = 1

                # Cleanup threads to free memory before getting data back
                # Daemon may still be running...
                # and will actually be deleted only when they exit after not getting a batch
                del threads

                # Gather results
                q_out_size = []
                q_out_size_tot = 0
                for i in range(self.nb_threads):
                    q_out_size.append(self.q_out[i].qsize())
                    q_out_size_tot += q_out_size[i]

                if self.verbose > 5:
                    print("[{}: log] Total output queues size is: {}".format(
                        self.pp, q_out_size_tot))
                    sys.stdout.flush()

                # Can get stuck here?
                dict_imgs = dict()
                for i in range(self.nb_threads):
                    if self.verbose > 4:
                        print("[{}] Thread {} q_out_size: {}".format(
                            self.pp, i + 1, q_out_size[i]))
                        sys.stdout.flush()
                    while q_out_size[i] > 0 and not self.q_out[i].empty():
                        if self.verbose > 6:
                            print("[{}] Thread {} q_out is not empty.".format(
                                self.pp, i + 1))
                            sys.stdout.flush()
                        try:
                            batch_out = self.q_out[i].get(True, 10)
                            if self.verbose > 4:
                                msg = "[{}] Got batch of {} features from thread {} q_out."
                                print(
                                    msg.format(self.pp, len(batch_out), i + 1))
                                sys.stdout.flush()
                            for sha1, dict_out in batch_out:
                                dict_imgs[sha1] = dict_out
                        except Exception as get_err:
                            if self.verbose > 1:
                                print(
                                    "[{}] Thread {} failed to get from q_out: {}"
                                    .format(self.pp, i + 1, get_err))
                                sys.stdout.flush()
                        if self.verbose > 4:
                            print(
                                "[{}] Marking task done in q_out of thread {}."
                                .format(self.pp, i + 1))
                            sys.stdout.flush()
                        self.q_out[i].task_done()
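                # dict_imgs now maps each processed sha1 to its dict_out of
                # column -> value pairs produced by the extractors; it is written
                # below as one row per image via push_dict_rows.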

                #if self.verbose > 0:
                print_msg = "[{}] Got features for {}/{} images in {}s."
                proc_time = time.time() - start_process
                print(
                    print_msg.format(self.pp, len(dict_imgs.keys()),
                                     len(list_in), proc_time))
                sys.stdout.flush()
                # --------

                # Push them
                # DONE: use out_indexer
                self.out_indexer.push_dict_rows(
                    dict_rows=dict_imgs,
                    table_name=self.out_indexer.table_sha1infos_name)

                # Mark batch as processed
                now_str = datetime.now().strftime('%Y-%m-%d:%H.%M.%S')
                # DONE: use out_indexer
                update_processed_dict = {
                    update_id: {
                        self.out_indexer.get_col_upproc(): now_str
                    }
                }
                self.out_indexer.push_dict_rows(
                    dict_rows=update_processed_dict,
                    table_name=self.out_indexer.table_updateinfos_name)

                # Mark as completed if all rows had an extraction
                if len(rows_batch) == len(dict_imgs):
                    # DONE: use out_indexer
                    update_completed_dict = {
                        update_id: {
                            self.out_indexer.get_col_upcomp(): str(1)
                        }
                    }
                    self.out_indexer.push_dict_rows(
                        dict_rows=update_completed_dict,
                        table_name=self.out_indexer.table_updateinfos_name)

                # Cleanup
                del self.q_in
                del self.q_out

                # To try to adjust a too optimistic nb_threads setting
                # if (sum(thread_creation_failed) > 0 or sum(deleted_extr) > 0) and self.nb_threads > 2:
                #   self.nb_threads -= 1

                msg = "[{}] Completed update {} in {}s."
                print(
                    msg.format(self.pp, update_id,
                               time.time() - start_update))
                sys.stdout.flush()
                self.nb_err = 0

                # Force garbage collection?
                gc.collect()

                # Should we just raise an Exception and restart clean?
                if sum(thread_creation_failed) > 0 or sum(deleted_extr) > 0:
                    raise ValueError(
                        "Something went wrong. Trying to restart clean")

        except Exception as inst:
            exc_type, exc_obj, exc_tb = sys.exc_info()
            fulltb = traceback.format_tb(exc_tb)
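            # Re-raise as the same exception type with the formatted traceback text
            # embedded in the message; this Python 2 pattern keeps the traceback as
            # text but drops the original traceback object (Python 3 code would
            # typically use `raise ... from inst` instead).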
            raise type(inst)(" {} ({})".format(inst, ''.join(fulltb)))

  def get_all_from_prefix(self, prefix_path):
    """Get all objects stored under ``prefix_path``.

    Args:
      prefix_path (str): prefix path

    Yields:
      object: python object loaded from pickle (not an S3 ``Object``)
    """
    # NB: used Google style documentation here to get the yield type recognized
    # NB: the ``def`` line above is reconstructed from the docstring; the original
    # method name was lost in extraction and may differ in the source
    if self.aws_prefix:
      prefix_path = '/'.join([self.aws_prefix, prefix_path])
    for obj in self.list_prefix(prefix_path):
      yield self.load(obj.key)
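
  # Illustrative usage sketch (not part of the original source), assuming the
  # generator above is exposed as `get_all_from_prefix`:
  #
  #   s3s = S3Storer(s3_conf, prefix="")
  #   for obj in s3s.get_all_from_prefix("some/prefix"):
  #     # each `obj` is whatever self.load() returns for the corresponding key
  #     print(type(obj))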

if __name__ == "__main__":
  # s3_conf = {"aws_profile": "cuimagesearch", "bucket_name": "dig-cu-imagesearchindex"}
  # s3s = S3Storer(s3_conf, prefix="")
  s3_conf = {"aws_profile": "tfhtimagesprod",
             "aws_region": "us-gov-west-1",
             "aws_prefix": "media",
             "bucket_name": "tellfinder-ht-images-prod",
             "pickling": False,
             "verbose": 5}
  s3s = S3Storer(s3_conf, prefix="")
  buffer = s3s.load("2E5BB236C6BE1A96F524EBA33D167C5A1A94D7C9")
  from cufacesearch.imgio.imgio import buffer_to_B64, get_SHA1_img_type_from_B64
  b64buffer = buffer_to_B64(buffer)
  print(len(b64buffer), get_SHA1_img_type_from_B64(b64buffer))
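
  # Illustrative follow-up (not part of the original source): list a few keys
  # under the bucket via list_prefix(), as used by the prefix generator above.
  #   for k, obj in enumerate(s3s.list_prefix("")):
  #     print(obj.key)
  #     if k >= 4:
  #       break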