def finalizer(global_conf_file, queueOut, queueFinalizer):
    print "[finalizer-pid({}): log] Started a finalizer worker at {}".format(os.getpid(), get_now())
    sys.stdout.flush()
    import glob
    searcher_finalizer = searcher_hbaseremote.Searcher(global_conf_file)
    print "[finalizer-pid({}): log] Finalizer worker ready at {}".format(os.getpid(), get_now())
    sys.stdout.flush()
    queueFinalizer.put("Finalizer ready")
    count_workers_ended = 0
    sim_pattern = '*-sim_' + str(searcher_finalizer.ratio) + '.txt'
    sim_partial_pattern = '*-sim_partial.txt'
    while True:
        try:
            print "[finalizer-pid({}): log] Finalizer worker waiting for an update at {}".format(os.getpid(), get_now())
            sys.stdout.flush()
            found_update = False
            ## Use glob to list files that match the simname pattern.
            list_simfiles = glob.glob(sim_pattern)
            found_update = finalize_udpate_list(list_simfiles, searcher_finalizer, partial=False)
            ## Push previously computed similarities for batches that did not complete
            list_simfiles = glob.glob(sim_partial_pattern)
            found_update_partial = finalize_udpate_list(list_simfiles, searcher_finalizer, partial=True)
            # Check if consumers have ended
            try:
                end_signal = queueOut.get(block=False)
                if end_signal == consumer_end_signal:
                    count_workers_ended += 1
                    print "[finalizer-pid({}): log] {} consumer workers ended out of {} at {}.".format(os.getpid(), count_workers_ended, nb_workers, get_now())
                    if count_workers_ended == nb_workers:
                        # should we check for intermediate sim patterns to know if consumers are actually still running, or failed?
                        # sim_pattern = '*-sim.txt'
                        # fully done
                        print "[finalizer-pid({}): log] All consumer workers ended at {}. Leaving.".format(os.getpid(), get_now())
                        return end_finalizer(queueFinalizer)
                    continue
            except Exception as inst:
                # queue empty
                pass
            # Sleep if no updates (full or partial) were found in this loop cycle
            if not found_update and not found_update_partial:
                time.sleep(time_sleep_noupdate)
        except Exception as inst:
            # e.g. "[finalizer: error] Caught error at 2017-04-14:04.29.23. Leaving. Error was: list index out of range"
            print "[finalizer-pid({}): error] Caught error at {}. Error {} was: {}".format(os.getpid(), get_now(), type(inst), inst)
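
# NOTE: finalize_udpate_list is called above but not defined in this section. Below is
# a minimal sketch of what it could look like, reusing only calls that appear in the
# glob-based finalizer later in this file (format_batch_sim_v2, indexer.write_batch,
# precomp_end_marker). Treat it as an illustrative assumption, not the actual helper.
def finalize_udpate_list_sketch(list_simfiles, searcher_finalizer, partial=False):
    found_update = False
    for simname in list_simfiles:
        found_update = True
        # simname is expected to look like '<update_id>-sim_<ratio>.txt'
        update_id = simname.split('-')[0]
        batch_sim, batch_mark_precomp_sim = format_batch_sim_v2(simname, searcher_finalizer)
        if batch_sim:
            searcher_finalizer.indexer.write_batch(batch_sim, searcher_finalizer.indexer.table_sim_name)
            searcher_finalizer.indexer.write_batch(batch_mark_precomp_sim, searcher_finalizer.indexer.table_sha1infos_name)
        if not partial:
            # only fully computed updates would get the end marker
            searcher_finalizer.indexer.write_batch([(update_id, {searcher_finalizer.indexer.precomp_end_marker: 'True'})],
                                                   searcher_finalizer.indexer.table_updateinfos_name)
        os.remove(simname)
    return found_update
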
def subsampler(global_conf_file, nb_subsamples=50, limit_batches=100):
    # set limit_batches to None to go on forever
    searcher_subsampler = searcher_hbaseremote.Searcher(global_conf_file)
    print "[subsampler: log] Subsampler ready."
    total_batches = 0
    need_break = False
    while True:
        if need_break:
            break
        start_get_batch = time.time()
        nb_list_sha1s = 0
        while nb_list_sha1s <= nb_subsamples:
            update_id, str_list_sha1s = searcher_subsampler.indexer.get_next_batch_precomp_sim()
            list_sha1s = str_list_sha1s.split(',')
            nb_list_sha1s = len(list_sha1s)
        if update_id is None:
            print "[subsampler: log] No more update to process."
            sys.stdout.flush()
            break
        else:
            print "[subsampler: log] Got update {} in {}s".format(update_id, time.time() - start_get_batch)
            sys.stdout.flush()
            start_subsample = time.time()
            subsample_id = 0
            subsample_list_sha1s = []
            for i, sha1 in enumerate(list_sha1s):
                if i % nb_subsamples == 0 and i > 0:
                    subsampled_update_id = update_id + "_" + str(subsample_id)
                    write_subsampled_update(subsampled_update_id, subsample_list_sha1s, searcher_subsampler)
                    subsample_id += 1
                    total_batches += 1
                    if limit_batches is not None and total_batches > limit_batches:
                        need_break = True
                        break
                    subsample_list_sha1s = [sha1]
                else:
                    subsample_list_sha1s.append(sha1)
            if len(subsample_list_sha1s) > 0:
                subsampled_update_id = update_id + "_" + str(subsample_id)
                write_subsampled_update(subsampled_update_id, subsample_list_sha1s, searcher_subsampler)
                total_batches += 1
                if limit_batches is not None and total_batches > limit_batches:
                    need_break = True
        if not need_break:
            print "[subsampler: log] Sleeping {}s before getting next update.".format(time_sleep)
            time.sleep(time_sleep)
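
# Quick sanity check of the chunking logic above, independent of HBase: a batch of 120
# sha1s with nb_subsamples=50 yields sub-batches of sizes 50, 50 and 20, with ids
# '<update_id>_0', '<update_id>_1', '<update_id>_2'. This is a hypothetical standalone
# rewrite of that loop, not code from the pipeline itself.
def chunk_update(update_id, list_sha1s, nb_subsamples=50):
    chunks = []
    for start in range(0, len(list_sha1s), nb_subsamples):
        sub_id = update_id + "_" + str(start // nb_subsamples)
        chunks.append((sub_id, list_sha1s[start:start + nb_subsamples]))
    return chunks

# e.g. chunk_update('update42', ['sha1_{}'.format(i) for i in range(120)])
# -> [('update42_0', <50 sha1s>), ('update42_1', <50 sha1s>), ('update42_2', <20 sha1s>)]
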
def consumer(global_conf_file, queueIn, queueOut, queueConsumer):
    print "[consumer-pid({}): log] Started a consumer worker at {}".format(os.getpid(), get_now())
    sys.stdout.flush()
    searcher_consumer = searcher_hbaseremote.Searcher(global_conf_file)
    print "[consumer-pid({}): log] Consumer worker ready at {}".format(os.getpid(), get_now())
    queueConsumer.put("Consumer ready")
    sys.stdout.flush()
    while True:
        try:
            ## Read from queueIn
            print "[consumer-pid({}): log] Consumer worker waiting for update at {}".format(os.getpid(), get_now())
            sys.stdout.flush()
            update_id, valid_sha1s, start_precomp = queueIn.get(block=True, timeout=queue_timeout)
            if update_id is None:
                # declare worker ended
                print "[consumer-pid({}): log] Consumer worker ending at {}".format(os.getpid(), get_now())
                return end_consumer(queueIn, queueOut)
            ## Search
            print "[consumer-pid({}): log] Consumer worker computing similarities for {} valid sha1s of update {} at {}".format(os.getpid(), len(valid_sha1s), update_id, get_now())
            sys.stdout.flush()
            start_search = time.time()
            # precompute similarities using searcher
            # for v1 check_indexed_noprecomp
            #simname, corrupted = searcher_consumer.search_from_sha1_list_get_simname(valid_sha1s, update_id)
            simname, corrupted = searcher_consumer.search_from_listid_get_simname(valid_sha1s, update_id, check_already_computed=True)
            elapsed_search = time.time() - start_search
            print "[consumer-pid({}): log] Consumer worker processed update {} at {}. Search performed in {}s.".format(os.getpid(), update_id, get_now(), elapsed_search)
            sys.stdout.flush()
            ## Push to queueOut
            #queueIn.task_done()
            start_push = time.time()
            queueOut.put((update_id, simname, valid_sha1s, corrupted, start_precomp, elapsed_search))
            print "[consumer-pid({}): log] Consumer worker pushed update {} to queueOut in {}s at {}.".format(os.getpid(), update_id, time.time() - start_push, get_now())
            sys.stdout.flush()
        except Exception as inst:
            print "[consumer-pid({}): error] Consumer worker caught error at {}. Error was {}".format(os.getpid(), get_now(), inst)
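
# end_consumer is not shown in this section. A plausible sketch of the shutdown
# handshake implied by the code above and by the finalizer's end-signal check
# (an assumption; the real helper may differ): re-post the None sentinel so sibling
# consumers also see it, then tell the finalizer this worker is done.
def end_consumer_sketch(queueIn, queueOut):
    queueIn.put((None, None, None))    # propagate the poison pill to the other consumers
    queueOut.put(consumer_end_signal)  # let the finalizer count ended consumers
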
def producer(global_conf_file, queueIn, queueProducer):
    print "[producer-pid({}): log] Started a producer worker at {}".format(os.getpid(), get_now())
    sys.stdout.flush()
    searcher_producer = searcher_hbaseremote.Searcher(global_conf_file)
    print "[producer-pid({}): log] Producer worker ready at {}".format(os.getpid(), get_now())
    queueProducer.put("Producer ready")
    while True:
        try:
            start_get_batch = time.time()
            update_id, str_list_sha1s = searcher_producer.indexer.get_next_batch_precomp_sim()
            #queueProducer.put("Producer got batch")
            print "[producer-pid({}): log] Got batch in {}s at {}".format(os.getpid(), time.time() - start_get_batch, get_now())
            sys.stdout.flush()
            if update_id is None:
                print "[producer-pid({}): log] No more update to process.".format(os.getpid())
                return end_producer(queueIn)
            else:
                start_precomp = time.time()
                # check that sha1s of batch have no precomputed similarities already in the sha1_infos table
                valid_sha1s, not_indexed_sha1s, precomp_sim_sha1s = check_indexed_noprecomp(searcher_producer, str_list_sha1s.split(','))
                # should we split valid_sha1s in batches of 100 or something smaller than the current 10K?
                # (see the sketch after this function)
                searcher_producer.indexer.write_batch([(update_id, {searcher_producer.indexer.precomp_start_marker: 'True'})],
                                                      searcher_producer.indexer.table_updateinfos_name)
                # push updates to be processed in queueIn
                # https://docs.python.org/3/library/multiprocessing.html#multiprocessing.Queue.qsize
                # qsize raises NotImplementedError on OS X...
                #print "[producer: log] Pushing update {} in queue containing {} items at {}.".format(update_id, queueIn.qsize(), get_now())
                print "[producer-pid({}): log] Pushing update {} at {}.".format(os.getpid(), update_id, get_now())
                sys.stdout.flush()
                queueIn.put((update_id, valid_sha1s, start_precomp))
                print "[producer-pid({}): log] Pushed update {} to queueIn at {}.".format(os.getpid(), update_id, get_now())
                sys.stdout.flush()
        except Exception as inst:
            print "[producer-pid({}): error] Error at {}. Leaving. Error was: {}".format(os.getpid(), get_now(), inst)
            return end_producer(queueIn)
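
# The comment in producer() asks whether valid_sha1s should be pushed in batches
# smaller than the current ~10K. A minimal sketch of that variant (hypothetical, not
# in the source): push one queueIn item per slice so consumers pick up work at a
# finer grain; the suffixed ids are an assumption about how sub-batches could be named.
def push_update_in_slices(queueIn, update_id, valid_sha1s, start_precomp, slice_size=100):
    for pos in range(0, len(valid_sha1s), slice_size):
        sliced_id = "{}_s{}".format(update_id, pos // slice_size)
        queueIn.put((sliced_id, valid_sha1s[pos:pos + slice_size], start_precomp))
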
    # tail of an APIResponder handler: flash the message and render the results page
    flash(flash_message, 'message')
    headers = {'Content-Type': 'text/html'}
    sys.stdout.flush()
    # TODO: pass named arguments instead of flash messages
    return make_response(render_template('view_similar_images.html'), 200, headers)


api.add_resource(APIResponder, '/cu_image_search/<string:mode>')


if __name__ == '__main__':
    parser = ArgumentParser()
    parser.add_argument("-c", "--conf", dest="conf_file", default=None)
    options = parser.parse_args()
    if options.conf_file is not None:
        print "Setting conf file to: {}".format(options.conf_file)
        global_conf_file = options.conf_file
    global_searcher = searcher_hbaseremote.Searcher(global_conf_file)
    global_start_time = datetime.now()
    ## This cannot recover from an 'IOError: [Errno 32] Broken pipe' error when the client
    ## disconnects before the response has been sent, e.g. nginx timeout at memexproxy...
    #app.run(debug=True, host='0.0.0.0')
    #app.run(debug=False, host='0.0.0.0')
    from gevent.wsgi import WSGIServer
    http_server = WSGIServer(('', 5000), app)
    #http_server = WSGIServer(('', 5002), app)
    http_server.serve_forever()
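
# Minimal client sketch for the endpoint registered above, on the port the WSGIServer
# binds. The mode name 'byURL' and the 'data' parameter are assumptions; check the
# APIResponder class (not shown in this section) for the modes and arguments it
# actually accepts.
import requests

resp = requests.get("http://localhost:5000/cu_image_search/byURL",
                    params={"data": "http://example.com/some_image.jpg"})
print resp.status_code
print resp.text[:500]
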
def finalizer(global_conf_file, queueOut, queueFinalizer):
    print "[finalizer-pid({}): log] Started a finalizer worker at {}".format(os.getpid(), get_now())
    sys.stdout.flush()
    import glob
    searcher_finalizer = searcher_hbaseremote.Searcher(global_conf_file)
    print "[finalizer-pid({}): log] Finalizer worker ready at {}".format(os.getpid(), get_now())
    sys.stdout.flush()
    queueFinalizer.put("Finalizer ready")
    count_workers_ended = 0
    sim_pattern = '*-sim_' + str(searcher_finalizer.ratio) + '.txt'
    while True:
        try:
            ## Read from queueOut
            print "[finalizer-pid({}): log] Finalizer worker waiting for an update at {}".format(os.getpid(), get_now())
            sys.stdout.flush()
            found_update = False
            ## Use glob to list files that match the simname pattern.
            list_simfiles = glob.glob(sim_pattern)
            for simname in list_simfiles:
                found_update = True
                start_finalize = time.time()
                # parse update_id
                update_id = simname.split('-')[0]
                print "[finalizer-pid({}): log] Finalizer worker found update {} to finalize at {}".format(os.getpid(), update_id, get_now())
                sys.stdout.flush()
                ## Check if update was not already finished by another finalizer?
                ## Push computed similarities
                # format for saving in HBase:
                # - batch_sim: should be a list of sha1 row key, dict of "s:similar_sha1": dist_value
                # - batch_mark_precomp_sim: should be a list of sha1 row key, dict of precomp_sim_column: True
                batch_sim, batch_mark_precomp_sim = format_batch_sim_v2(simname, searcher_finalizer)
                # push similarities to HBI_table_sim (escorts_images_similar_row_dev) using searcher.indexer.write_batch
                if batch_sim:
                    searcher_finalizer.indexer.write_batch(batch_sim, searcher_finalizer.indexer.table_sim_name)
                    # push to weekly update table for Amandeep to integrate in DIG
                    week, year = get_week_year()
                    weekly_sim_table_name = searcher_finalizer.indexer.table_sim_name + "_Y{}W{}".format(year, week)
                    print "[finalizer-pid({}): log] weekly table name: {}".format(os.getpid(), weekly_sim_table_name)
                    weekly_sim_table = searcher_finalizer.indexer.get_create_table(weekly_sim_table_name, families={'s': dict()})
                    searcher_finalizer.indexer.write_batch(batch_sim, weekly_sim_table_name)
                ## Mark as done
                # mark precomp_sim true in escorts_images_sha1_infos
                searcher_finalizer.indexer.write_batch(batch_mark_precomp_sim, searcher_finalizer.indexer.table_sha1infos_name)
                # mark update as processed
                searcher_finalizer.indexer.write_batch([(update_id, {searcher_finalizer.indexer.precomp_end_marker: 'True'})],
                                                       searcher_finalizer.indexer.table_updateinfos_name)
                ## Cleanup
                try:
                    # remove simname
                    os.remove(simname)
                    # remove features file
                    featfn = update_id + '.dat'
                    os.remove(featfn)
                except Exception as inst:
                    print "[finalizer-pid({}): error] Could not cleanup. Error was: {}".format(os.getpid(), inst)
                # We don't have start_precomp anymore
                #print "[finalizer-pid({}): log] Finalized update {} at {} in {}s total.".format(os.getpid(), update_id, get_now(), time.time() - start_precomp)
                print "[finalizer-pid({}): log] Finalized update {} at {} in {}s.".format(os.getpid(), update_id, get_now(), time.time() - start_finalize)
                sys.stdout.flush()
                # if debug:
                #     print "Sleeping for {}s.".format(debug_sleep)
                #     sys.stdout.flush()
                #     time.sleep(debug_sleep)
            # Check if consumers have ended
            try:
                end_signal = queueOut.get(block=False)
                if end_signal == consumer_end_signal:
                    count_workers_ended += 1
                    print "[finalizer-pid({}): log] {} consumer workers ended out of {} at {}.".format(os.getpid(), count_workers_ended, nb_workers, get_now())
                    if count_workers_ended == nb_workers:
                        # should we check for intermediate sim patterns to know if consumers are actually still running, or failed?
                        # sim_pattern = '*-sim.txt'
                        # fully done
                        print "[finalizer-pid({}): log] All consumer workers ended at {}. Leaving.".format(os.getpid(), get_now())
                        return end_finalizer(queueFinalizer)
                    continue
            except Exception as inst:
                # queue empty
                pass
            # Sleep if no updates were found in this loop cycle
            if not found_update:
                time.sleep(time_sleep)
        except Exception as inst:
            # e.g. "[finalizer: error] Caught error at 2017-04-14:04.29.23. Leaving. Error was: list index out of range"
            print "[finalizer-pid({}): error] Caught error at {}. Error {} was: {}".format(os.getpid(), get_now(), type(inst), inst)
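
# get_week_year is used above to build the weekly table suffix but is not defined in
# this section. A minimal sketch based on ISO week numbering (an assumption; the real
# helper may use a different convention):
from datetime import datetime

def get_week_year_sketch():
    year, week, _ = datetime.now().isocalendar()
    return week, year  # matches the 'week, year = get_week_year()' unpacking above
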
def finalizer(global_conf_file, queueOut, queueFinalizer):
    print "[finalizer-pid({}): log] Started a finalizer worker at {}".format(os.getpid(), get_now())
    sys.stdout.flush()
    searcher_finalizer = searcher_hbaseremote.Searcher(global_conf_file)
    print "[finalizer-pid({}): log] Finalizer worker ready at {}".format(os.getpid(), get_now())
    queueFinalizer.put("Finalizer ready")
    count_workers_ended = 0
    while True:
        try:
            ## Read from queueOut
            print "[finalizer-pid({}): log] Finalizer worker waiting for an update at {}".format(os.getpid(), get_now())
            sys.stdout.flush()
            # This seems to block (or not get update info) even if there are items that have been pushed to queueOut??
            update_id, simname, valid_sha1s, corrupted, start_precomp, elapsed_search = queueOut.get(block=True, timeout=queue_timeout)
            if update_id is None:
                count_workers_ended += 1
                print "[finalizer-pid({}): log] {} consumer workers ended out of {} at {}.".format(os.getpid(), count_workers_ended, nb_workers, get_now())
                #queueOut.task_done()
                if count_workers_ended == nb_workers:
                    # fully done
                    print "[finalizer-pid({}): log] All consumer workers ended at {}. Leaving.".format(os.getpid(), get_now())
                    return end_finalizer(queueOut, queueFinalizer)
                continue
            print "[finalizer-pid({}): log] Finalizer worker got update {} from queueOut to finalize at {}".format(os.getpid(), update_id, get_now())
            sys.stdout.flush()
            ## Check if update was not already finished by another finalizer?
            ## Push computed similarities
            print simname
            # format for saving in HBase:
            # - batch_sim: should be a list of sha1 row key, dict of "s:similar_sha1": dist_value
            # - batch_mark_precomp_sim: should be a list of sha1 row key, dict of precomp_sim_column: True
            batch_sim, batch_mark_precomp_sim = format_batch_sim(simname, valid_sha1s, corrupted, searcher_finalizer)
            # push similarities to HBI_table_sim (escorts_images_similar_row_dev) using searcher.indexer.write_batch
            if batch_sim:
                searcher_finalizer.indexer.write_batch(batch_sim, searcher_finalizer.indexer.table_sim_name)
                # push to weekly update table for Amandeep to integrate in DIG
                week, year = get_week_year()
                weekly_sim_table_name = searcher_finalizer.indexer.table_sim_name + "_Y{}W{}".format(year, week)
                print "[finalizer-pid({}): log] weekly table name: {}".format(os.getpid(), weekly_sim_table_name)
                weekly_sim_table = searcher_finalizer.indexer.get_create_table(weekly_sim_table_name, families={'s': dict()})
                searcher_finalizer.indexer.write_batch(batch_sim, weekly_sim_table_name)
            ## Mark as done
            # mark precomp_sim true in escorts_images_sha1_infos_dev
            searcher_finalizer.indexer.write_batch(batch_mark_precomp_sim, searcher_finalizer.indexer.table_sha1infos_name)
            # mark info:precomp_finish in escorts_images_updates_dev
            if not corrupted:
                # do not mark finished if we faced some issue? mark as corrupted?
                searcher_finalizer.indexer.write_batch([(update_id, {searcher_finalizer.indexer.precomp_end_marker: 'True'})],
                                                       searcher_finalizer.indexer.table_updateinfos_name)
            print "[finalizer-pid({}): log] Finalized update {} at {} in {}s total.".format(os.getpid(), update_id, get_now(), time.time() - start_precomp)
            sys.stdout.flush()
            ## Cleanup
            if simname:
                try:
                    # remove simname
                    os.remove(simname)
                    # remove features file
                    featfirst = simname.split('sim')[0]
                    featfn = featfirst[:-1] + '.dat'
                    #print "[process_one_update: log] Removing file {}".format(featfn)
                    os.remove(featfn)
                except Exception as inst:
                    print "[finalizer-pid({}): error] Could not cleanup. Error was: {}".format(os.getpid(), inst)
            #queueOut.task_done()
        except Exception as inst:
            # e.g. "[finalizer: error] Caught error at 2017-04-14:04.29.23. Leaving. Error was: list index out of range"
            print "[finalizer-pid({}): error] Caught error at {}. Error {} was: {}".format(os.getpid(), get_now(), type(inst), inst)
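
# How the workers above are presumably wired together (a sketch; the actual driver is
# not part of this section). The worker signatures and the ready-handshake on the
# dedicated queues mirror what the workers themselves expect; nb_workers is taken as a
# parameter here although the workers read it as a module-level global.
from multiprocessing import Process, Queue

def start_workers_sketch(global_conf_file, nb_workers=2):
    queueIn = Queue()
    queueOut = Queue()
    queueProducer = Queue()
    queueConsumer = Queue()
    queueFinalizer = Queue()
    proc_producer = Process(target=producer, args=(global_conf_file, queueIn, queueProducer))
    proc_consumers = [Process(target=consumer, args=(global_conf_file, queueIn, queueOut, queueConsumer))
                      for _ in range(nb_workers)]
    proc_finalizer = Process(target=finalizer, args=(global_conf_file, queueOut, queueFinalizer))
    proc_producer.start()
    for p in proc_consumers:
        p.start()
    proc_finalizer.start()
    # block until each worker reports ready, matching the put() calls in the workers
    queueProducer.get()
    for _ in range(nb_workers):
        queueConsumer.get()
    queueFinalizer.get()
    return proc_producer, proc_consumers, proc_finalizer
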