コード例 #1
0
 def build_extr_str(self):
   """Return the extraction string, building and caching it on first use.

   If ``self.extr_str`` is ``None``, it is computed with the generic
   extractor's ``build_extr_str`` from ``self.featurizer_type``,
   ``self.detector_type`` and ``self.input_type`` and stored on the instance.

   :return: the value of ``self.extr_str``
   """
   if self.extr_str is not None:
     # Already built: reuse the cached value
     return self.extr_str
   # Import deferred to the first call that actually needs the helper
   from cufacesearch.extractor.generic_extractor import build_extr_str as make_extr_str
   # Arguments are, in order: featurizer_type, detector_type, input_type
   self.extr_str = make_extr_str(self.featurizer_type, self.detector_type, self.input_type)
   return self.extr_str
コード例 #2
0
  def __init__(self, global_conf, prefix=default_extr_proc_prefix):
    """Set up the extraction processor from its configuration.

    Reads the processor parameters, then creates the queues, one extractor, the HBase indexer
    and the Kafka ingester.

    :param global_conf: configuration, forwarded to the parent constructor
    :param prefix: configuration prefix, forwarded to the parent constructor
    """
    # State defaults, set before the parent constructor runs
    self.extractor = None
    self.nb_empt = 0
    self.nb_err = 0
    self.max_proc_time = 1200 # in seconds. Increased for sbcmdline...
    self.url_input = True

    super(ExtractionProcessor, self).__init__(global_conf, prefix)

    # Mandatory parameters
    self.input_type = self.get_required_param("input_type")
    self.nb_threads = self.get_required_param("nb_threads")
    self.featurizer_type = self.get_required_param("featurizer_type")
    self.featurizer_prefix = self.get_required_param("featurizer_prefix")
    self.detector_type = self.get_required_param("detector_type")

    # "full" means the feature is extracted from the whole image: no detector
    if self.detector_type == "full":
      self.detector = None

    # Optional verbosity level, 0 unless configured
    self.verbose = 0
    conf_verbose = self.get_param("verbose")
    if conf_verbose:
      self.verbose = int(conf_verbose)

    # Optional ingestion source, "kafka" unless configured
    self.ingestion_input = "kafka"
    conf_ingestion_input = self.get_param("ingestion_input")
    if conf_ingestion_input:
      self.ingestion_input = conf_ingestion_input

    # A truthy 'file_input' switches from URL input to file input
    conf_file_input = self.get_param("file_input")
    print("[{}.ExtractionProcessor: log] file_input: {}".format(self.pp, conf_file_input))
    if conf_file_input:
      self.url_input = False
    print("[{}.ExtractionProcessor: log] url_input: {}".format(self.pp, self.url_input))

    # Image location column: URL column for URL input, path column otherwise
    self.img_column = img_URL_column if self.url_input else img_path_column
    print("[{}.ExtractionProcessor: log] img_column: {}".format(self.pp, self.img_column))

    # Column family of the extractions, "ext" unless configured
    # NB: needs to be built from extraction type and detection input + "_processed"
    self.extr_family_column = "ext"
    conf_extr_family_column = self.get_param("extr_family_column")
    if conf_extr_family_column:
      self.extr_family_column = conf_extr_family_column

    # Any truthy 'push_back' value turns the flag on
    self.push_back = False
    if self.get_param("push_back"):
      self.push_back = True

    self.extr_prefix = build_extr_str(self.featurizer_type, self.detector_type, self.input_type)
    self.set_pp()

    # Queues must exist before the extractors are created
    self.init_queues()

    # Extractors are initialized only once; a single one is created here
    self.extractors = []
    first_extractor = GenericExtractor(self.detector_type, self.featurizer_type,
                                       self.input_type, self.extr_family_column,
                                       self.featurizer_prefix, self.global_conf)
    self.extractors.append(first_extractor)

    # Beware, the self.extr_family_column should be added to the indexer families parameter in get_create_table...
    # What if the table has some other column families?...
    self.tablesha1_col_families = {'info': {}, self.extr_family_column: {}}

    # HBase indexer
    indexer_prefix = self.get_required_param("indexer_prefix")
    self.indexer = HBaseIndexerMinimal(self.global_conf, prefix=indexer_prefix)
    self.last_update_date_id = ''

    # Kafka ingester
    ingester_prefix = self.get_required_param("proc_ingester_prefix")
    self.ingester = GenericKafkaProcessor(self.global_conf, prefix=ingester_prefix)
    self.ingester.pp = "ep"
コード例 #3
0
    def __init__(self, global_conf, prefix=DEFAULT_EXTR_PROC_PREFIX):
        """Set up the extraction processor from its configuration.

        Reads the processor parameters, then creates the queues, the output
        and input indexers, one extractor and the Kafka ingester.

        :param global_conf: configuration file or dictionary
        :type global_conf: str, dict
        :param prefix: prefix in configuration
        :type prefix: str
        """
        # State defaults, set before the parent constructor runs
        self.extractor = None
        self.nb_empt = 0
        self.nb_err = 0
        self.max_proc_time = 1200  # in seconds. Increased for sbcmdline...
        self.url_input = True

        super(ExtractionProcessor, self).__init__(global_conf, prefix)

        # TODO: move that to self.read_conf()
        # Mandatory parameters
        self.input_type = self.get_required_param("input_type")
        self.nb_threads = self.get_required_param("nb_threads")
        self.featurizer_type = self.get_required_param("featurizer_type")
        self.featurizer_prefix = self.get_required_param("featurizer_prefix")
        self.detector_type = self.get_required_param("detector_type")

        # Optional parameters, each with its fallback value
        self.verbose = int(self.get_param("verbose", default=0))
        self.ingestion_input = self.get_param(
            "ingestion_input", default="kafka")
        self.push_back = self.get_param("push_back", default=False)

        # A truthy 'file_input' switches from URL input to file input
        conf_file_input = self.get_param("file_input")
        file_input_msg = "[{}.ExtractionProcessor: log] file_input: {}"
        print(file_input_msg.format(self.pp, conf_file_input))
        if conf_file_input:
            self.url_input = False
        url_input_msg = "[{}.ExtractionProcessor: log] url_input: {}"
        print(url_input_msg.format(self.pp, self.url_input))

        # "full" means the feature is extracted from the whole image
        if self.detector_type == "full":
            self.detector = None

        self.extr_prefix = build_extr_str(
            self.featurizer_type, self.detector_type, self.input_type)
        self.set_pp()

        # Queues must exist before the indexers and extractors are created
        self.init_queues()

        # Two indexers are used:
        # - "in_indexer" for the TF table with buffer, img URLs etc...
        # - "out_indexer" for our table with extractions etc
        # NB: they could be the same if tables are merged...
        out_indexer_prefix = self.get_required_param("indexer_prefix")
        self.out_indexer = HBaseIndexerMinimal(self.global_conf,
                                               prefix=out_indexer_prefix)
        in_indexer_prefix = self.get_param("in_indexer_prefix", default=False)
        if not in_indexer_prefix:
            # No dedicated input indexer configured: share the output one
            print(
                "[{}] empty 'in_indexer_prefix', using out_indexer as in_indexer too."
                .format(self.pp))
            self.in_indexer = self.out_indexer
        else:
            self.in_indexer = HBaseIndexerMinimal(self.global_conf,
                                                  prefix=in_indexer_prefix)
            sha1_table = self.in_indexer.table_sha1infos_name
            sha1_col_families = self.in_indexer.get_dictcf_sha1_table()
            in_indexer_msg = "[{}] 'in_indexer' sha1 table {} columns are: {}"
            print(in_indexer_msg.format(
                self.pp, sha1_table, sha1_col_families))

        # Extractors are initialized only once; a single one is created
        # here, using the 'extrcf' attribute of the out_indexer
        self.extractors = []
        first_extractor = GenericExtractor(
            self.detector_type, self.featurizer_type, self.input_type,
            self.out_indexer.extrcf, self.featurizer_prefix, self.global_conf)
        self.extractors.append(first_extractor)

        # Image location column comes from the in_indexer: URL column for
        # URL input, path column otherwise
        self.img_column = (self.in_indexer.get_col_imgurl()
                           if self.url_input else
                           self.in_indexer.get_col_imgpath())
        # Only logged here: buffer, backup URL and image location columns
        img_cols = [
            self.in_indexer.get_col_imgbuff(),
            self.in_indexer.get_col_imgurlbak(),
            self.img_column,
        ]
        img_cols_msg = "[{}.ExtractionProcessor: log] img_cols: {}"
        print(img_cols_msg.format(self.pp, img_cols))

        self.last_update_date_id = ''

        # Kafka ingester
        ingester_prefix = self.get_required_param("proc_ingester_prefix")
        self.ingester = GenericKafkaProcessor(self.global_conf,
                                              prefix=ingester_prefix)
        self.ingester.pp = "ep"