def setup(self):
    self.session = boto3.Session(profile_name=self.aws_profile)
    self.s3 = self.session.resource('s3')
    # Try to access the bucket first to make sure it is reachable
    try:
        self.s3.meta.client.head_bucket(Bucket=self.bucket_name)
    except botocore.exceptions.ClientError as e:
        err_msg = "[{}: error] Could not check bucket {} using profile {}"
        full_trace_error(err_msg.format(self.pp, self.bucket_name, self.aws_profile))
        raise e
    self.bucket = self.s3.Bucket(self.bucket_name)
    if self.verbose > 0:
        print("[{}: log] Initialized with bucket '{}' and profile '{}'.".format(
            self.pp, self.bucket_name, self.aws_profile))
def get_batch_kafka(self):
    # Read from a kafka topic to allow safer parallelization on different machines
    try:
        # Needs to read topic to get update_id and list of sha1s
        for msg in self.ingester.consumer:
            msg_dict = json.loads(msg.value)
            update_id = msg_dict.keys()[0]
            # NB: Try to get update info and check it was really not processed yet.
            if self.is_update_unprocessed(update_id):
                str_list_sha1s = msg_dict[update_id]
                list_sha1s = str_list_sha1s.split(',')
                print("[{}.get_batch_kafka: log] Update {} has {} images.".format(self.pp, update_id, len(list_sha1s)))
                # NB: we could also get 'ext:' of images to double check if extraction was already processed
                #rows_batch = self.indexer.get_columns_from_sha1_rows(list_sha1s, columns=["info:img_buffer"])
                if self.verbose > 3:
                    print("[{}.get_batch_kafka: log] Looking for columns: {}".format(self.pp, [img_buffer_column, self.img_column]))
                rows_batch = self.indexer.get_columns_from_sha1_rows(list_sha1s, columns=[img_buffer_column, self.img_column])
                #print "rows_batch", rows_batch
                if rows_batch:
                    if self.verbose > 4:
                        print("[{}.get_batch_kafka: log] Yielding for update: {}".format(self.pp, update_id))
                    yield rows_batch, update_id
                    self.ingester.consumer.commit()
                    if self.verbose > 4:
                        print("[{}.get_batch_kafka: log] After yielding for update: {}".format(self.pp, update_id))
                    self.last_update_date_id = '_'.join(update_id.split('_')[-2:])
                    # Should we try to commit offset only at this point?
                else:
                    print("[{}.get_batch_kafka: log] Did not get any image buffers for the update: {}".format(self.pp, update_id))
            else:
                print("[{}.get_batch_kafka: log] Skipping already processed update: {}".format(self.pp, update_id))
        else:
            print("[{}.get_batch_kafka: log] No update found.".format(self.pp))
            # Fall back to checking HBase for unstarted/unfinished updates
            for rows_batch, update_id in self.get_batch_hbase():
                yield rows_batch, update_id
    except Exception as inst:
        full_trace_error("[{}.get_batch_kafka: error] {}".format(self.pp, inst))
def get_batch_hbase(self):
    # legacy implementation: better to have a kafka topic for batches to be processed to allow
    # safe parallelization on different machines
    try:
        for updates in self.indexer.get_unprocessed_updates_from_date(self.last_update_date_id,
                                                                      extr_type=self.extr_prefix):
            for update_id, update_cols in updates:
                if self.extr_prefix in update_id:
                    # double check update has not been processed somewhere else
                    if self.is_update_unprocessed(update_id):
                        # double check update was not marked as started recently i.e. by another process
                        if self.is_update_notstarted(update_id, max_delay=TIME_ELAPSED_FAILED):
                            list_sha1s = update_cols[column_list_sha1s].split(',')
                            log_msg = "[{}.get_batch_hbase: log] Update {} has {} images."
                            print(log_msg.format(self.pp, update_id, len(list_sha1s)))
                            # also get 'ext:' to check if extraction was already processed?
                            rows_batch = self.indexer.get_columns_from_sha1_rows(list_sha1s,
                                                                                 columns=[img_buffer_column, self.img_column])
                            # print "rows_batch", rows_batch
                            if rows_batch:
                                yield rows_batch, update_id
                                self.last_update_date_id = '_'.join(update_id.split('_')[-2:])
                            else:
                                log_msg = "[{}.get_batch_hbase: log] Did not get any image buffer for update: {}"
                                print(log_msg.format(self.pp, update_id))
                        else:
                            log_msg = "[{}.get_batch_hbase: log] Skipping update started recently: {}"
                            print(log_msg.format(self.pp, update_id))
                            continue
                    else:
                        log_msg = "[{}.get_batch_hbase: log] Skipping already processed update: {}"
                        print(log_msg.format(self.pp, update_id))
                        continue
                else:
                    if self.verbose > 6:
                        log_msg = "[{}.get_batch_hbase: log] Skipping update {} from another extraction type."
                        print(log_msg.format(self.pp, update_id))
        else:
            print("[{}.get_batch_hbase: log] No unprocessed update found.".format(self.pp))
            # Look for updates that have some unprocessed images
            # TODO: whether we do that or not could be specified by a parameter
            # as this induces slow down during update...
            for updates in self.indexer.get_missing_extr_updates_from_date("1970-01-01",
                                                                           extr_type=self.extr_prefix):
                for update_id, update_cols in updates:
                    if self.extr_prefix in update_id:
                        if column_list_sha1s in update_cols:
                            list_sha1s = update_cols[column_list_sha1s].split(',')
                            log_msg = "[{}.get_batch_hbase: log] Update {} has {} images missing extractions."
                            print(log_msg.format(self.pp, update_id, len(list_sha1s)))
                            sys.stdout.flush()
                            # also get 'ext:' to check if extraction was already processed?
                            rows_batch = self.indexer.get_columns_from_sha1_rows(list_sha1s,
                                                                                 columns=[img_buffer_column, self.img_column])
                            if rows_batch:
                                yield rows_batch, update_id
                            else:
                                log_msg = "[{}.get_batch_hbase: log] Did not get any image buffer for update: {}"
                                print(log_msg.format(self.pp, update_id))
                        else:
                            log_msg = "[{}.get_batch_hbase: log] Update {} has no images list."
                            print(log_msg.format(self.pp, update_id))
                    else:
                        log_msg = "[{}.get_batch_hbase: log] Skipping update {} from another extraction type."
                        print(log_msg.format(self.pp, update_id))
            else:
                log_msg = "[{}.get_batch_hbase: log] No updates with missing extractions found."
                print(log_msg.format(self.pp))
                sys.stdout.flush()
    except Exception as inst:
        full_trace_error("[{}.get_batch_hbase: error] {}".format(self.pp, inst))
def init_searcher(self):
    """ Initialize LOPQ model and searcher from `global_conf` value.
    """
    try:
        # Try to load pretrained model from storer
        lopq_model = self.storer.load(self.build_model_str())
        if lopq_model is None:
            raise ValueError("Could not load model from storer.")
        # if self.verbose > 1:
        #   print("pca_mu.shape: {}".format(lopq_model.pca_mu.shape))
        #   print("pca_P.shape: {}".format(lopq_model.pca_P.shape))
    except Exception as inst:
        if type(inst) != ValueError:
            full_trace_error(inst)
        print("[{}: log] Looks like model was not trained yet ({})".format(self.pp, inst))
        self.loaded_pretrain_model = False

        # Try to get it from public bucket e.g.:
        # https://s3-us-west-2.amazonaws.com/dig-cu-imagesearchindex/sbpycaffe_feat_full_image_lopq_pca-pca256-subq256-M8-V256_train100000
        if self.get_pretrained_model:
            log_msg = "[{}: log] Trying to retrieve pre-trained model {} from s3"
            print(log_msg.format(self.pp, self.build_model_str()))
            from ..common.dl import download_file
            import pickle
            try:
                base_model_path = "https://s3-us-west-2.amazonaws.com/dig-cu-imagesearchindex/"
                # This can fail with a "retrieval incomplete: got only" ...
                download_file(base_model_path + self.build_model_str(), self.build_model_str())
                lopq_model = pickle.load(open(self.build_model_str(), 'rb'))
                # Avoid overwriting the model in s3 with s3storer using dig-cu-imagesearchindex bucket
                is_s3_storer = isinstance(self.storer, S3Storer)
                if is_s3_storer and self.storer.bucket_name == "dig-cu-imagesearchindex":
                    log_msg = "[{}: log] Skipping saving model {} back to s3"
                    print(log_msg.format(self.pp, self.build_model_str()))
                else:
                    log_msg = "[{}: log] Saving model {} to storer"
                    print(log_msg.format(self.pp, self.build_model_str()))
                    self.storer.save(self.build_model_str(), lopq_model)
                log_msg = "[{}: log] Loaded pretrained model {} from s3"
                print(log_msg.format(self.pp, self.build_model_str()))
                self.loaded_pretrain_model = True
            except Exception as inst:
                log_msg = "[{}: log] Could not load pretrained model {} from s3: {}"
                #print(log_msg.format(self.pp, self.build_model_str(), inst))
                full_trace_error(log_msg.format(self.pp, self.build_model_str(), inst))
                sys.stdout.flush()
        else:
            log_msg = "[{}: log] Skipped retrieving pre-trained model from s3 as requested."
            print(log_msg.format(self.pp, self.build_model_str()))

        if not self.loaded_pretrain_model:
            # This is from our modified LOPQ package...
            # https://github.com/ColumbiaDVMM/ColumbiaImageSearch/tree/master/workflows/build-lopq-index/lopq/python
            # 'LOPQModelPCA' could be the type of the model loaded from pickle file
            # from lopq.model import LOPQModel, LOPQModelPCA

            # Size of DB should depend on nb_train... How should we properly set the size of this?
            # It should be nb_train_pca * size_feat + nb_train * size_feat_pca
            feat_size = get_feat_size(self.featurizer_type)
            if self.model_type == "lopq_pca":
                map_size = self.nb_train_pca * feat_size * 4 * 8
                map_size += self.nb_train * self.model_params['pca'] * 4 * 8
            else:
                map_size = self.nb_train * feat_size * 4 * 8
            self.save_feat_env = lmdb.open('/data/lmdb_feats_' + self.build_model_str(),
                                           map_size=int(1.1 * map_size),
                                           writemap=True, map_async=True, max_dbs=2)

            # Train and save model in save_path folder
            lopq_model = self.train_index()
            # TODO: we could build a more unique model identifier
            # (using domain information? sha1/md5 of model parameters? using date of training?)
            # that would also mean we should list from the storer and guess
            # (based on date of creation) the correct model above...
            self.storer.save(self.build_model_str(), lopq_model)

    # Setup searcher with LOPQ model
    if lopq_model:
        # LOPQSearcherLMDB is now the default, as it makes the index more persistent
        # and potentially more easily usable with multiple processes.
        if self.lopq_searcher == "LOPQSearcherLMDB":
            from lopq.search import LOPQSearcherLMDB
            # TODO: should we get path from a parameter? and/or add model_str to it?
            # self.searcher = LOPQSearcherLMDB(lopq_model, lmdb_path='./lmdb_index/', id_lambda=str)
            # self.updates_env = lmdb.open('./lmdb_updates/', map_size=1024 * 1000000 * 1, writemap=True, map_async=True, max_dbs=1)
            self.searcher = LOPQSearcherLMDB(lopq_model,
                                             lmdb_path='/data/lmdb_index_' + self.build_model_str(),
                                             id_lambda=str)
            # How could we properly set the size of this?
            self.updates_env = lmdb.open('/data/lmdb_updates_' + self.build_model_str(),
                                         map_size=1024 * 1000000 * 1,
                                         writemap=True, map_async=True, max_dbs=1)
            self.updates_index_db = self.updates_env.open_db("updates")
        elif self.lopq_searcher == "LOPQSearcher":
            from lopq.search import LOPQSearcher
            self.searcher = LOPQSearcher(lopq_model)
        else:
            raise ValueError("Unknown 'lopq_searcher' type: {}".format(self.lopq_searcher))
if __name__ == "__main__":
    # Get conf file
    parser = ArgumentParser()
    parser.add_argument("-c", "--conf", dest="conf_file", required=True)
    parser.add_argument("-p", "--prefix", dest="prefix", default=default_extr_proc_prefix)
    options = parser.parse_args()

    # TODO: should we daemonize that too?
    # Initialize extraction processor
    ep = ExtractionProcessor(options.conf_file, prefix=options.prefix)
    nb_err = 0

    print("Extraction processor options are: {}".format(options))
    sys.stdout.flush()

    while True:
        try:
            ep.run()
            nb_err = 0
        except Exception as inst:
            full_trace_error("Extraction processor failed: {}".format(inst))
            sys.stdout.flush()
            break
            #raise inst
            # del ep
            # gc.collect()
            # time.sleep(10*nb_err)
            # ep = ExtractionProcessor(options.conf_file, prefix=options.prefix)
            # nb_err += 1
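# Illustrative invocation of this processor (the script and conf file names below are hypothetical;
# "-c" expects the JSON configuration consumed by ExtractionProcessor and "-p" optionally overrides
# the default extraction prefix 'default_extr_proc_prefix'):
#   python extraction_processor.py -c conf_extraction.json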
def get_train_features(self, nb_features, lopq_pca_model=None, nb_min_train=None):
    if nb_min_train is None:
        nb_min_train = nb_features
    if lopq_pca_model:
        feats_db = self.save_feat_env.open_db("feats_pca")
        dtype = np.float32
    else:
        feats_db = self.save_feat_env.open_db("feats")
        from ..featurizer.featsio import get_feat_dtype
        dtype = get_feat_dtype(self.featurizer_type)
    nb_saved_feats = self.get_nb_saved_feats(feats_db)
    nb_features_to_read = nb_features
    seen_updates = set()

    if nb_saved_feats < nb_features:
        print("[{}: log] Gathering {} training samples...".format(self.pp, nb_features))
        sys.stdout.flush()
        start_date = "1970-01-01"
        done = False
        # Accumulate until we have enough features, or we have read all features if 'wait_for_nbtrain'
        # is false and we have at least nb_min_train
        while not done:
            for batch_updates in self.indexer.get_updates_from_date(start_date=start_date,
                                                                    extr_type=self.build_extr_str()):
                for update in batch_updates:
                    # for update in updates:
                    try:
                        # We could check if update has been processed, but if not we won't get features anyway
                        update_id = update[0]
                        if column_list_sha1s in update[1]:
                            if update_id not in seen_updates:
                                list_sha1s = update[1][column_list_sha1s]
                                samples_ids, features = self.indexer.get_features_from_sha1s(list_sha1s.split(','),
                                                                                             self.build_extr_str())
                                if features:
                                    # Apply PCA to features here to save memory
                                    if lopq_pca_model:
                                        np_features = lopq_pca_model.apply_PCA(np.asarray(features))
                                    else:
                                        np_features = np.asarray(features)
                                    log_msg = "[{}: log] Got features {} from update {}"
                                    print(log_msg.format(self.pp, np_features.shape, update_id))
                                    sys.stdout.flush()
                                    # just appending like this does not account for duplicates...
                                    # train_features.extend(np_features)
                                    nb_saved_feats = self.save_feats_to_lmbd(feats_db, samples_ids, np_features)
                                    seen_updates.add(update_id)
                                else:
                                    if self.verbose > 3:
                                        log_msg = "[{}: log] Did not get features from update {}"
                                        print(log_msg.format(self.pp, update_id))
                                        sys.stdout.flush()
                                if nb_saved_feats >= nb_features:
                                    done = True
                                    break
                        else:
                            warn_msg = "[{}: warning] Update {} has no list of images associated to it."
                            print(warn_msg.format(self.pp, update_id))
                            sys.stdout.flush()
                    except Exception as inst:
                        from cufacesearch.common.error import full_trace_error
                        err_msg = "[{}: error] Failed to get features: {} {}"
                        full_trace_error(err_msg.format(self.pp, type(inst), inst))
                        sys.stdout.flush()
                else:
                    if self.verbose > 4:
                        print("[{}: log] Got {} training samples so far...".format(self.pp, nb_saved_feats))
                        sys.stdout.flush()
                if done:
                    nb_features_to_read = nb_saved_feats
                    break
            else:
                if not done:
                    # Wait for new updates...
                    # TODO: could be optional
                    if self.wait_for_nbtrain:
                        if nb_saved_feats >= nb_min_train:
                            log_msg = "[{}: log] Gathered minimum number of training features ({})..."
                            print(log_msg.format(self.pp, nb_min_train))
                            sys.stdout.flush()
                            break
                        else:
                            log_msg = "[{}: log] Waiting for new updates. Got {} training samples so far..."
                            print(log_msg.format(self.pp, nb_saved_feats))
                            sys.stdout.flush()
                            time.sleep(60)
                    else:
                        log_msg = "[{}: log] Gathered all available features ({})..."
                        print(log_msg.format(self.pp, self.get_nb_saved_feats(feats_db)))
                        sys.stdout.flush()
                        break

    return self.get_feats_from_lmbd(feats_db, nb_features_to_read, dtype)
def load_codes(self, full_refresh=False):
    """Load codes.

    :param full_refresh: whether to perform a full refresh or not
    :type full_refresh: bool
    """
    # For multi-workers setting with gunicorn
    self.set_pp(pp="SearcherLOPQHBase." + str(os.getpid()))
    # Calling this method can also perform an update of the index
    if not self.searcher:
        info_msg = "[{}.load_codes: info] Not loading codes as searcher is not initialized."
        print(info_msg.format(self.pp))
        return

    start_load = time.time()
    total_compute_time = 0

    try:
        # try to get date of last update
        start_date = "1970-01-01"
        if not full_refresh:
            start_date = self.get_latest_update_suffix()

        extr_str = self.build_extr_str()
        feat_size = get_feat_size(self.featurizer_type)
        #feat_type = get_feat_dtype(self.featurizer_type)
        feat_type = self.featurizer_type

        # Get all updates ids for the extraction type
        # TODO: this scan makes the API unresponsive for ~2 minutes during the update process...
        for batch_updates in self.indexer.get_updates_from_date(start_date=start_date, extr_type=extr_str):
            for update in batch_updates:
                update_id = update[0]
                # mdb_txn_begin: MDB_BAD_RSLOT: Invalid reuse of reader locktable slot?
                if self.is_update_indexed(update_id) and not full_refresh:
                    if self.verbose > 4:
                        print("[{}: log] Skipping update {} already indexed.".format(self.pp, update_id))
                    continue
                else:
                    dtn = datetime.now()
                    if self.is_update_processed(update[1]) and not self.skip_update(update_id, dtn):
                        print("[{}: log] Looking for codes of update {}".format(self.pp, update_id))
                        # Get this update codes
                        codes_string = self.build_codes_string(update_id)
                        try:
                            # Check for precomputed codes
                            codes_dict = self.storer.load(codes_string, silent=True)
                            if codes_dict is None:
                                msg = "[{}: log] Could not load codes from {}"
                                raise ValueError(msg.format(self.pp, codes_string))
                            # If full_refresh, check that we have as many codes as available features
                            if full_refresh:
                                # Also check for 'completed' flag?
                                if self.indexer.get_col_listsha1s() in update[1]:
                                    set_sha1s = set(update[1][self.indexer.get_col_listsha1s()].split(','))
                                    sids, _ = self.indexer.get_features_from_sha1s(list(set_sha1s), extr_str)
                                    if len(set(sids)) > len(codes_dict):
                                        msg = "[{}: log] Update {} has {} new features"
                                        diff_count = len(set(sids)) - len(codes_dict)
                                        raise ValueError(msg.format(self.pp, update_id, diff_count))
                                    else:
                                        msg = "[{}: log] Skipping update {} indexed with all {}/{} features"
                                        print(msg.format(self.pp, update_id, len(codes_dict), len(set(sids))))
                                        miss_extr = self.indexer.get_missing_extr_sha1s(list(set_sha1s), extr_str,
                                                                                        skip_failed=self.skipfailed)
                                        # If all sha1s have been processed, no need to ever check that update again
                                        # Store that information as future date_db to avoid ever checking again...
                                        if not miss_extr and self.lopq_searcher == "LOPQSearcherLMDB":
                                            dtn = dtn.replace(year=9999)
                        except Exception as inst:
                            # Update codes not available
                            if self.verbose > 3:
                                print(inst)
                            # Compute codes for update not yet processed and save them
                            start_compute = time.time()
                            # Get detections (if any) and features
                            if self.indexer.get_col_listsha1s() in update[1]:
                                list_sha1s = list(set(update[1][self.indexer.get_col_listsha1s()].split(',')))
                                sids, fts = self.indexer.get_features_from_sha1s(list_sha1s, extr_str, feat_type)
                                if fts:
                                    if fts[0].shape[-1] != feat_size:
                                        msg = "[{}.load_codes: error] Invalid feature size {} vs {} expected"
                                        raise ValueError(msg.format(self.pp, fts[0].shape[-1], feat_size))
                                    codes_dict = self.compute_codes(sids, fts, codes_string)
                                    update_compute_time = time.time() - start_compute
                                    total_compute_time += update_compute_time
                                    if self.verbose > 0:
                                        log_msg = "[{}: log] Update {} codes computation done in {}s"
                                        print(log_msg.format(self.pp, update_id, update_compute_time))
                                else:
                                    print("[{}: warning] Update {} has no features.".format(self.pp, update_id))
                                    continue
                            else:
                                print("[{}: warning] Update {} has no list of images.".format(self.pp, update_id))
                                continue
                        # Use new method add_codes_from_dict of searcher
                        self.searcher.add_codes_from_dict(codes_dict)
                        self.add_update(update_id, date_db=dtn)

        total_load = time.time() - start_load
        self.last_refresh = datetime.now()
        print("[{}: log] Total updates computation time is: {}s".format(self.pp, total_compute_time))
        print("[{}: log] Total updates loading time is: {}s".format(self.pp, total_load))
        # Total updates loading time is: 0.0346581935883s, really? Seems much longer
    except Exception as inst:
        full_trace_error("[{}: error] Could not load codes. {}".format(self.pp, inst))
def get_batch_kafka(self):
    """Get one batch of images from Kafka.

    :yield: tuple (rows_batch, update_id)
    """
    # Read from a kafka topic to allow safer parallelization on different machines
    # DONE: use in_indexer
    img_cols = [self.in_indexer.get_col_imgbuff(), self.in_indexer.get_col_imgurlbak(), self.img_column]
    try:
        # Needs to read topic to get update_id and list of sha1s
        if self.ingester.consumer:
            for msg in self.ingester.consumer:
                msg_dict = json.loads(msg.value)
                update_id = msg_dict.keys()[0]
                # NB: Try to get update info and check it was really not processed yet.
                if self.is_update_unprocessed(update_id):
                    str_list_sha1s = msg_dict[update_id]
                    list_sha1s = str_list_sha1s.split(',')
                    msg = "[{}.get_batch_kafka: log] Update {} has {} images."
                    print(msg.format(self.pp, update_id, len(list_sha1s)))
                    if self.verbose > 3:
                        msg = "[{}.get_batch_kafka: log] Looking for columns: {}"
                        print(msg.format(self.pp, img_cols))
                    # DONE: use in_indexer
                    rows_batch = self.in_indexer.get_columns_from_sha1_rows(list_sha1s, columns=img_cols)
                    #print "rows_batch", rows_batch
                    if rows_batch:
                        if self.verbose > 4:
                            msg = "[{}.get_batch_kafka: log] Yielding for update: {}"
                            print(msg.format(self.pp, update_id))
                        yield rows_batch, update_id
                        self.ingester.consumer.commit()
                        if self.verbose > 4:
                            msg = "[{}.get_batch_kafka: log] After yielding for update: {}"
                            print(msg.format(self.pp, update_id))
                        self.last_update_date_id = '_'.join(update_id.split('_')[-2:])
                        # Should we try to commit offset only at this point?
                    else:
                        msg = "[{}.get_batch_kafka: log] Did not get any image buffers for the update: {}"
                        print(msg.format(self.pp, update_id))
                else:
                    msg = "[{}.get_batch_kafka: log] Skipping already processed update: {}"
                    print(msg.format(self.pp, update_id))
            else:
                print("[{}.get_batch_kafka: log] No update found.".format(self.pp))
                # Fall back to checking HBase for unstarted/unfinished updates
                for rows_batch, update_id in self.get_batch_hbase():
                    yield rows_batch, update_id
        else:
            print("[{}.get_batch_kafka: log] No consumer found.".format(self.pp))
            # Fall back to checking HBase for unstarted/unfinished updates
            for rows_batch, update_id in self.get_batch_hbase():
                yield rows_batch, update_id
    except Exception as inst:
        full_trace_error("[{}.get_batch_kafka: error] {}".format(self.pp, inst))
def get_batch_hbase(self):
    """Get one batch of images from HBase.

    :yield: tuple (rows_batch, update_id)
    """
    # legacy implementation: better to have a kafka topic for batches to be processed to allow
    # safe and efficient parallelization on different machines
    # DONE: use in_indexer
    img_cols = [self.in_indexer.get_col_imgbuff(), self.in_indexer.get_col_imgurlbak(), self.img_column]
    try:
        # DONE: use out_indexer
        for updates in self.out_indexer.get_unprocessed_updates_from_date(self.last_update_date_id,
                                                                          extr_type=self.extr_prefix):
            for update_id, update_cols in updates:
                if self.extr_prefix in update_id:
                    # double check update has not been processed somewhere else
                    if self.is_update_unprocessed(update_id):
                        # double check update was not marked as started recently i.e. by another process
                        if self.is_update_notstarted(update_id, max_delay=TIME_ELAPSED_FAILED):
                            # DONE: use out_indexer
                            list_sha1s = update_cols[self.out_indexer.get_col_listsha1s()].split(',')
                            msg = "[{}.get_batch_hbase: log] Update {} has {} images."
                            print(msg.format(self.pp, update_id, len(list_sha1s)))
                            # We should time that, it seems slow i.e. 2/3 minutes per update.
                            try:
                                rows_batch = self.in_indexer.get_columns_from_sha1_rows(list_sha1s,
                                                                                        rbs=BATCH_SIZE_IMGBUFFER,
                                                                                        columns=img_cols)
                            except Exception:
                                msg = "[{}.get_batch_hbase: warning] Failed retrieving images data for update: {}"
                                print(msg.format(self.pp, update_id))
                                # flush?
                                sys.stdout.flush()
                                # Update self.last_update_date_id ?
                                #self.last_update_date_id = '_'.join(update_id.split('_')[-2:])
                                continue
                            # print "rows_batch", rows_batch
                            if rows_batch:
                                yield rows_batch, update_id
                            else:
                                msg = "[{}.get_batch_hbase: log] Did not get any image buffer for update: {}"
                                print(msg.format(self.pp, update_id))
                                #msg = "[{}.get_batch_hbase: log] Was trying to read columns {} from table {} for rows {}"
                                #print(msg.format(self.pp, img_cols, self.in_indexer.table_sha1infos_name, list_sha1s))
                            # Store last update id
                            self.last_update_date_id = '_'.join(update_id.split('_')[-2:])
                        else:
                            msg = "[{}.get_batch_hbase: log] Skipping update started recently: {}"
                            print(msg.format(self.pp, update_id))
                            continue
                    else:
                        msg = "[{}.get_batch_hbase: log] Skipping already processed update: {}"
                        print(msg.format(self.pp, update_id))
                        continue
                else:
                    if self.verbose > 6:
                        msg = "[{}.get_batch_hbase: log] Skipping update {} from another extraction type."
                        print(msg.format(self.pp, update_id))
        else:
            print("[{}.get_batch_hbase: log] No unprocessed update found.".format(self.pp))
            # Should we reinitialize self.last_update_date_id?
            # Look for updates that have some unprocessed images
            # TODO: whether we do that or not could be specified by a parameter
            # as this induces slow down during update...
            # DONE: use out_indexer
            count_ucme = 0
            stop_cme = False
            for updates in self.out_indexer.get_missing_extr_updates_from_date(self.last_missing_extr_date,
                                                                               extr_type=self.extr_prefix):
                for update_id, update_cols in updates:
                    if self.extr_prefix in update_id:
                        # DONE: use out_indexer
                        if self.out_indexer.get_col_listsha1s() in update_cols:
                            list_sha1s = update_cols[self.out_indexer.get_col_listsha1s()].split(',')
                            msg = "[{}.get_batch_hbase: log] Update {} has {} images missing extractions."
                            print(msg.format(self.pp, update_id, len(list_sha1s)))
                            sys.stdout.flush()
                            # also get 'ext:' to check if extraction was already processed?
                            # DONE: use in_indexer
                            rows_batch = self.in_indexer.get_columns_from_sha1_rows(list_sha1s,
                                                                                    rbs=BATCH_SIZE_IMGBUFFER,
                                                                                    columns=img_cols)
                            if rows_batch:
                                yield rows_batch, update_id
                                self.last_missing_extr_date = '_'.join(update_id.split('_')[-2:])
                                count_ucme += 1
                                if count_ucme >= self.maxucme:
                                    stop_cme = True
                                    break
                            else:
                                msg = "[{}.get_batch_hbase: log] Did not get any image buffer for update: {}"
                                print(msg.format(self.pp, update_id))
                        else:
                            msg = "[{}.get_batch_hbase: log] Update {} has no images list."
                            print(msg.format(self.pp, update_id))
                    else:
                        msg = "[{}.get_batch_hbase: log] Skipping update {} from another extraction type."
                        print(msg.format(self.pp, update_id))
                # We have reached the maximum number of checks for missing extractions in one call
                if stop_cme:
                    break
            else:
                if stop_cme:
                    msg = "[{}.get_batch_hbase: log] Stopped checking updates with missing extractions"
                    msg += " after finding {}/{}."
                    print(msg.format(self.pp, count_ucme, self.maxucme))
                    msg = "[{}.get_batch_hbase: log] Will restart next time from: {}"
                    print(msg.format(self.pp, self.last_missing_extr_date))
                    sys.stdout.flush()
                else:
                    msg = "[{}.get_batch_hbase: log] No updates with missing extractions found."
                    print(msg.format(self.pp))
                    sys.stdout.flush()
                    # Re-initialize dates just to make sure we don't miss anything
                    self.last_update_date_id = "1970-01-01"
                    self.last_missing_extr_date = "1970-01-01"
    except Exception as inst:
        # If we reach this point it is really a succession of failures
        full_trace_error("[{}.get_batch_hbase: error] {}".format(self.pp, inst))
        # Raise Exception to restart process or docker
        raise inst
parser.add_argument("-p", "--port", dest="port", type=int, default=5000)
parser.add_argument("-e", "--endpoint", dest="endpoint", type=str, default="cuimgsearch")
options = parser.parse_args()

# Load config file
print("[API: log] Setting conf file to: {}".format(options.conf_file))
global_conf = json.load(open(options.conf_file, 'rt'))

# Initialize searcher object only once
while True:
    try:
        api.global_searcher = searcher_lopqhbase.SearcherLOPQHBase(global_conf)
        break
    except Exception as inst:
        err_msg = "Failed to initialize searcher ({}): {}".format(type(inst), inst)
        from cufacesearch.common.error import full_trace_error
        full_trace_error(err_msg)
        time.sleep(60)

api.global_start_time = datetime.now()
api.input_type = api.global_searcher.input_type

# Start API
searchapi.add_resource(api.APIResponder, '/'+options.endpoint+'/<string:mode>')
print("[API: log] Starting Search API on port {} with endpoint '{}'".format(options.port, options.endpoint))
sys.stdout.flush()
# NB: 'gevent.wsgi' was renamed 'gevent.pywsgi' in more recent gevent releases
from gevent.wsgi import WSGIServer
http_server = WSGIServer(('', options.port), app)
http_server.serve_forever()
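# Illustrative request against the running API, using the default port and endpoint above
# (hypothetical host; the valid '<mode>' values and query parameters are defined by
# api.APIResponder and are not shown in this snippet):
#   curl "http://localhost:5000/cuimgsearch/<mode>"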