def persist_similarity_index():
    if index is not None:
        file = os.path.join(cfg.resolve(cfg.FAISS_SETTINGS, cfg.index_path),
                            cfg.resolve(cfg.FAISS_SETTINGS, cfg.index_file))
        faiss.write_index(index, file)
        logger.info("Faiss index saved to disk")
    else:
        logger.warning("Can't save, index was not loaded yet!")
def persist_blacklist_index():
    path = cfg.resolve(cfg.FAISS_SETTINGS, cfg.index_path)
    file = os.path.join(path, cfg.resolve(cfg.FAISS_SETTINGS, cfg.index_blacklist_file))
    with open(file, 'w') as f:
        writer = csv.DictWriter(f, fieldnames=['index'])
        writer.writeheader()
        # snapshot the blacklist as a list of rows while holding the mutex
        blacklist_rows = threadsafe_blacklist_operation(
            lambda bl: [{'index': id} for id in bl])
        writer.writerows(blacklist_rows)
def map_index_ids_to_asset_metas(indices_ids):
    num_entries = np.array(indices_ids).shape[0]
    asset_metas = []
    search = Search(index=cfg.resolve(cfg.ELASTICSEARCH_SERVER, cfg.index_prefix) +
                          cfg.resolve(cfg.ELASTICSEARCH_SERVER, cfg.index_asset_meta))
    search.query = Q('terms', faiss_idx=indices_ids)
    search = search[:num_entries]
    response = search.execute()
    for hit in response:
        asset_metas.append(AssetMeta(hit.asset_id, hit.cropped_id, hit.faiss_idx))
    return asset_metas if response.hits.total > 0 else []
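
# get_similar_asset_metas() (used by classify_content()) is defined elsewhere in the
# project; a minimal sketch, assuming it queries the faiss index for the nearest
# neighbours of a feature vector and resolves the resulting ids through the function
# above. The blacklist filtering step is an assumption, not the actual implementation.
def get_similar_asset_metas(faiss_features, n_similar):
    _, ids = index.search(faiss_features, n_similar)  # faiss returns (distances, ids)
    candidate_ids = [int(i) for i in ids[0] if i >= 0 and int(i) not in blacklist]
    return map_index_ids_to_asset_metas(candidate_ids)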
def initialize_retinanet():
    global model
    logger.info('Loading retinanet classification model...')
    # os.environ["CUDA_VISIBLE_DEVICES"] = "0"
    logger.info('Setting keras session...')
    keras.backend.tensorflow_backend.set_session(get_session())
    logger.info('Loading model...')
    model = models.load_model(
        cfg.resolve(cfg.RETINANET_MODEL, cfg.model_path) +
        cfg.resolve(cfg.RETINANET_MODEL, cfg.model_name),
        backbone_name=cfg.resolve(cfg.RETINANET_MODEL, cfg.backbone_name))
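
# get_session() is defined elsewhere in the project; a minimal sketch, assuming
# TensorFlow 1.x and the usual keras-retinanet setup with GPU memory growth enabled:
def get_session():
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  # claim GPU memory on demand instead of up front
    return tf.Session(config=config)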
def initialize_elastic_search():
    global db_asset, db_cropped, db_asset_meta
    connections.create_connection(hosts=cfg.resolve(cfg.ELASTICSEARCH_SERVER, cfg.host),
                                  port=cfg.resolve(cfg.ELASTICSEARCH_SERVER, cfg.port),
                                  timeout=20)
    db_asset = Index(
        cfg.resolve(cfg.ELASTICSEARCH_SERVER, cfg.index_prefix) +
        cfg.resolve(cfg.ELASTICSEARCH_SERVER, cfg.index_asset))
    if not db_asset.exists():
        db_asset.doc_type(EsAsset)
        db_asset.create()
    db_asset_meta = Index(
        cfg.resolve(cfg.ELASTICSEARCH_SERVER, cfg.index_prefix) +
        cfg.resolve(cfg.ELASTICSEARCH_SERVER, cfg.index_asset_meta))
    if not db_asset_meta.exists():
        db_asset_meta.doc_type(EsAssetMeta)
        db_asset_meta.create()
    db_cropped = Index(
        cfg.resolve(cfg.ELASTICSEARCH_SERVER, cfg.index_prefix) +
        cfg.resolve(cfg.ELASTICSEARCH_SERVER, cfg.index_cropped))
    if not db_cropped.exists():
        db_cropped.doc_type(EsCropped)
        db_cropped.create()
    logger.info("Elastic search initialized!")
def backup_persisting_files():
    global round_robin_backup_index
    round_robin_backup_index += 1
    round_robin_backup_index %= cfg.resolve_int(cfg.CRON_JOB,
                                                cfg.cron_job_round_robin_backups)
    # copy faiss index file
    path = cfg.resolve(cfg.FAISS_SETTINGS, cfg.index_path)
    file = os.path.join(path, cfg.resolve(cfg.FAISS_SETTINGS, cfg.index_file))
    copyfile(file, file + '.backup_{}'.format(round_robin_backup_index))
    # copy blacklist file
    file = os.path.join(path, cfg.resolve(cfg.FAISS_SETTINGS, cfg.index_blacklist_file))
    copyfile(file, file + '.backup_{}'.format(round_robin_backup_index))
def initialize_blacklist():
    global blacklist, blacklist_mutex
    blacklist_mutex = Lock()
    path = cfg.resolve(cfg.FAISS_SETTINGS, cfg.index_path)
    if not os.path.exists(path):
        os.mkdir(path)
    file = os.path.join(path, cfg.resolve(cfg.FAISS_SETTINGS, cfg.index_blacklist_file))
    if not os.path.exists(file):
        Path(file).touch()
    with open(file, 'r') as f:
        reader = csv.DictReader(f)
        for row in reader:
            blacklist.append(int(row['index']))
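
# threadsafe_blacklist_operation() is referenced by persist_blacklist_index() but
# defined elsewhere; a minimal sketch, assuming it simply runs the given callable
# on the blacklist while holding blacklist_mutex:
def threadsafe_blacklist_operation(operation):
    with blacklist_mutex:
        return operation(blacklist)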
def remove_cropped_if_asset_exists(asset):
    try:
        search = Search(index=cfg.resolve(cfg.ELASTICSEARCH_SERVER, cfg.index_prefix) +
                              cfg.resolve(cfg.ELASTICSEARCH_SERVER, cfg.index_asset_meta))
        search.query = Q('match', asset_id=asset.asset_id)
        search.exclude()
        # delete all cropped documents that belong to this asset
        for hit in search:
            idx = '{}-{}'.format(asset.asset_id, hit.cropped_id)
            s = Search(index=cfg.resolve(cfg.ELASTICSEARCH_SERVER, cfg.index_prefix) +
                             cfg.resolve(cfg.ELASTICSEARCH_SERVER, cfg.index_cropped))
            s.query = Q('match', id=idx)
            s.delete()
        # then delete the asset meta documents themselves
        search.delete()
    except Exception:
        logger.error('Could not remove existing cropped images: {}'.format(sys.exc_info()[0]))
def initialize_similarity_index():
    global index
    path = cfg.resolve(cfg.FAISS_SETTINGS, cfg.index_path)
    if not os.path.exists(path):
        os.mkdir(path)
    file = os.path.join(path, cfg.resolve(cfg.FAISS_SETTINGS, cfg.index_file))
    if not os.path.exists(file):
        index = faiss.IndexFlatIP(cfg.resolve_int(cfg.FAISS_SETTINGS, cfg.index_size))
        persist_similarity_index()
    else:
        try:
            index = faiss.read_index(file)
            logger.info("Faiss index loaded")
        except (OSError, TypeError, NameError):
            # fall back to a fresh, empty index if the persisted file can't be read
            index = faiss.IndexFlatIP(cfg.resolve_int(cfg.FAISS_SETTINGS, cfg.index_size))
            logger.error("Can't load index! Using default empty index")
def initialize_logging():
    print('Initializing logging...')
    # set up logging to file
    logging.basicConfig(
        level=logging.DEBUG,
        format='%(asctime)s %(name)-12s %(levelname)-8s %(message)s',
        datefmt='%m-%d %H:%M',
        filename=cfg.resolve(cfg.DEFAULT, cfg.log_dir) + cfg.resolve(cfg.DEFAULT, cfg.log_name),
        filemode='a')
    # define a handler which writes INFO messages or higher to sys.stderr
    console = logging.StreamHandler()
    console.setLevel(logging.INFO)
    # use a simpler format for console output
    formatter = logging.Formatter('%(name)-12s: %(levelname)-8s %(message)s')
    console.setFormatter(formatter)
    # add the handler to the root logger
    logging.getLogger('').addHandler(console)
def index_original_image(img, asset):
    # save the original image to disk
    original_dir = cfg.resolve(cfg.CLASSIFICATION, cfg.original_images_path)
    if not os.path.exists(original_dir):
        os.makedirs(original_dir)
        logger.info('Created new dir: {}'.format(original_dir))
    ori_file_name = '{}/{}.png'.format(original_dir, asset.asset_id)
    cv2.imwrite(ori_file_name, img)
    # insert the asset into the database
    logger.info('Indexing original asset...')
    es_asset = EsAsset(meta={'id': asset.asset_id},
                       asset_id=asset.asset_id,
                       path=ori_file_name)
    es_asset.save()
    return ori_file_name
def index_cropped_image(asset, img, label_name, idx, insert=False):
    # save the cropped image to disk
    extraction_dir = '{}/{}'.format(cfg.resolve(cfg.CLASSIFICATION, cfg.extracted_images_path),
                                    label_name)
    if not os.path.exists(extraction_dir):
        os.makedirs(extraction_dir)
        logger.info('Created new dir: {}'.format(extraction_dir))
    cropped_file_name = '{}/{}-{}.png'.format(extraction_dir, asset.asset_id, idx)
    logger.info('Extracted image: {}'.format(cropped_file_name))
    converted_img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)
    cv2.imwrite(cropped_file_name, converted_img)
    # insert the cropped image into the database
    if insert:
        es_cropped = EsCropped(meta={'id': '{}-{}'.format(asset.asset_id, idx)},
                               asset_id=asset.asset_id,
                               path=cropped_file_name)
        es_cropped.save()
    return cropped_file_name
@app.route('/services/v1/shutdown', methods=['GET'])
def shutdown_hook():
    core.trigger_backup()
    sys.exit()


@app.route('/services/v1/index/init', methods=['GET'])
def init_similarity_index():
    core.initialize_elastic_search()
    return Response(status=200)


@app.before_first_request
def initialize():
    core.initialize_similarity_index()
    core.initialize_blacklist()
    core.initialize_elastic_search()
    core.initialize_retinanet()
    core.initialize_extraction_model()
    core.initialize_cron_job()


if __name__ == '__main__':
    core.initialize_logging()
    logger.info('Server app started!')
    app.run(host=cfg.resolve(cfg.RETINANET_SERVER, cfg.host),
            port=cfg.resolve_int(cfg.RETINANET_SERVER, cfg.port),
            debug=cfg.resolve_bool(cfg.RETINANET_SERVER, cfg.debug),
            threaded=cfg.resolve_bool(cfg.RETINANET_SERVER, cfg.threaded))
    class Meta:
        index = cfg.resolve(cfg.ELASTICSEARCH_SERVER, cfg.index_prefix) + \
                cfg.resolve(cfg.ELASTICSEARCH_SERVER, cfg.index_asset_meta)
    class Meta:
        index = cfg.resolve(cfg.ELASTICSEARCH_SERVER, cfg.index_prefix) + \
                cfg.resolve(cfg.ELASTICSEARCH_SERVER, cfg.index_cropped)
def classify_content(content):
    # collect the urls of all assets to classify
    urls = []
    for asset in content.assets:
        urls.append(asset.url)

    # prepare images for download
    val_generator = UrlGenerator(urls,
                                 cfg.resolve(cfg.RETINANET_MODEL, cfg.classes_file),
                                 cfg.resolve(cfg.RETINANET_MODEL, cfg.labels_file))
    response = Response()

    # load image
    for i, asset in enumerate(content.assets):
        logger.info('Running classification on: {}'.format(asset.url))

        # initialize result object
        result = Result()
        result.url = asset.url
        result.asset_id = asset.asset_id

        logger.info('Reading image bgr...')
        try:
            # fetch image
            image = val_generator.read_image_bgr(i)
            # index original image for searching
            if content.insert:
                index_original_image(image, asset)
        except (OSError, ConnectTimeout, HTTPError, ReadTimeout, Timeout, ConnectionError):
            logger.warning('Skipped: Unable to reach resource')
            continue
        except Exception:
            err = traceback.format_exc()
            logger.error('Could not read image: {}'.format(err))
            continue

        # copy to draw on
        logger.info('Drawing cvt color...')
        draw = np.asarray(image.copy())
        draw = cv2.cvtColor(draw, cv2.COLOR_BGR2RGB)

        # pre-process the image for the network
        logger.info('Processing image...')
        image = val_generator.preprocess_image(image)
        image, scale = val_generator.resize_image(image)

        # classify image
        start = time.time()
        boxes, scores, labels = core.model.predict_on_batch(np.expand_dims(image, axis=0))
        elapsed = time.time() - start
        logger.info('Processing time: {}'.format(elapsed))
        result.time = str(elapsed)
        boxes /= scale

        # process and save detections
        idx = 0
        for box, score, label in zip(boxes[0], scores[0], labels[0]):
            if score < cfg.resolve_float(cfg.CLASSIFICATION, cfg.min_confidence):
                continue

            # get position data
            box = boxes[0, idx, :4].astype(int)
            label_name = val_generator.label_to_name(label)

            # save meta-info for REST API response
            caption = Caption(str(label), label_name, str(score),
                              '{};{}'.format(box[0], box[1]),   # x1;y1
                              '{};{}'.format(box[2], box[3]))   # x2;y2
            result.captions.append(caption)

            # crop image for extraction
            h = box[3] - box[1]
            w = box[2] - box[0]
            cropped_img = draw[box[1]:(box[1] + h), box[0]:(box[0] + w)]

            if content.insert:
                # update sequence to remove previous index if available
                remove_cropped_if_asset_exists(asset)

            # process cropped image fragment for searching
            cropped_file_name = index_cropped_image(asset, cropped_img, label_name, idx,
                                                    insert=content.insert)
            features = extract_features(cropped_file_name)
            faiss_features = features.reshape((1, cfg.resolve_int(cfg.FAISS_SETTINGS,
                                                                  cfg.index_size)))

            if content.insert:
                # add feature to faiss index
                core.index.add(faiss_features)
            else:
                # clean temp image again
                os.remove(cropped_file_name)

            # index caption
            if content.insert:
                index_asset_meta(asset, idx, caption, features.tolist(),
                                 core.index.ntotal - 1)

            # find similar suggestions and handle response
            asset_metas = get_similar_asset_metas(faiss_features,
                                                  cfg.resolve_int(cfg.FAISS_SETTINGS,
                                                                  cfg.index_n_similar_results))
            handle_suggestion_response(result, asset.asset_id, asset_metas)
            idx += 1

        # add result to response list
        response.result_list.append(result)
    return response
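
# extract_features() is defined elsewhere in the project; a hypothetical sketch only,
# assuming initialize_extraction_model() exposes a Keras model as `extraction_model`
# and that features are L2-normalised so the inner-product search of IndexFlatIP
# behaves like cosine similarity. Input size and preprocessing are assumptions.
def extract_features(image_path):
    img = cv2.imread(image_path)
    img = cv2.resize(img, (224, 224)).astype(np.float32)
    features = extraction_model.predict(np.expand_dims(img, axis=0))[0]
    return features / np.linalg.norm(features)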
import logging

import config_accessor as cfg
from elasticsearch_dsl import DocType, Keyword, Text

logger = logging.getLogger('celum.models_es')

search_index_prefix = cfg.resolve(cfg.ELASTICSEARCH_SERVER, cfg.index_prefix)


class EsAsset(DocType):
    asset_id = Keyword()
    path = Text()

    class Meta:
        index = cfg.resolve(cfg.ELASTICSEARCH_SERVER, cfg.index_prefix) + \
                cfg.resolve(cfg.ELASTICSEARCH_SERVER, cfg.index_asset)

    def save(self, **kwargs):
        return super(EsAsset, self).save(**kwargs)


class EsAssetMeta(DocType):
    asset_id = Keyword()
    cropped_id = Keyword()
    faiss_idx = Text()
    label = Text()
    score = Text()
    top_left = Text()
    bottom_right = Text()
    feature = Text()