def finalizer(global_conf_file, queueOut, queueFinalizer):
    print "[finalizer-pid({}): log] Started a finalizer worker at {}".format(
        os.getpid(), get_now())
    sys.stdout.flush()
    import glob
    searcher_finalizer = searcher_hbaseremote.Searcher(global_conf_file)
    print "[finalizer-pid({}): log] Finalizer worker ready at {}".format(
        os.getpid(), get_now())
    sys.stdout.flush()
    queueFinalizer.put("Finalizer ready")
    count_workers_ended = 0
    sim_pattern = '*-sim_' + str(searcher_finalizer.ratio) + '.txt'
    sim_partial_pattern = '*-sim_partial.txt'
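    # e.g. with a (hypothetical) ratio of 0.9 these would match files named
    # 'someupdateid-sim_0.9.txt' and 'someupdateid-sim_partial.txt'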
    while True:
        try:
            print "[finalizer-pid({}): log] Finalizer worker waiting for an update at {}".format(
                os.getpid(), get_now())
            sys.stdout.flush()
            found_update = False

            ## Use glob to list the files that match the simname pattern.
            list_simfiles = glob.glob(sim_pattern)
            found_update = finalize_udpate_list(list_simfiles,
                                                searcher_finalizer,
                                                partial=False)

            ## Push previously computed similarities for batches that did not complete
            list_simfiles = glob.glob(sim_partial_pattern)
            found_update_partial = finalize_udpate_list(list_simfiles,
                                                        searcher_finalizer,
                                                        partial=True)

            # Check if consumers have ended
            try:
                end_signal = queueOut.get(block=False)
                if end_signal == consumer_end_signal:
                    count_workers_ended += 1
                    print "[finalizer-pid({}): log] {} consumer workers ended out of {} at {}.".format(
                        os.getpid(), count_workers_ended, nb_workers,
                        get_now())
                    if count_workers_ended == nb_workers:
                        # should we check for intermediate sim patterns to know if consumers are actually still running, or failed?
                        # sim_pattern = '*-sim.txt'
                        # fully done
                        print "[finalizer-pid({}): log] All consumer workers ended at {}. Leaving.".format(
                            os.getpid(), get_now())
                        return end_finalizer(queueFinalizer)
                    continue
            except Exception as inst:  # most likely Queue.Empty: no end signal yet
                pass

            # Sleep if no update (full or partial) was found in this loop cycle
            if not (found_update or found_update_partial):
                time.sleep(time_sleep_noupdate)

        except Exception as inst:
            #[finalizer: error] Caught error at 2017-04-14:04.29.23. Leaving. Error was: list index out of range
            print "[finalizer-pid({}): error] Caught error at {}. Error {} was: {}".format(
                os.getpid(), get_now(), type(inst), inst)
def subsampler(global_conf_file, nb_subsamples=50, limit_batches=100):
    # set limit_batches to None to go on forever
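    # e.g. subsampler(global_conf_file, nb_subsamples=50, limit_batches=None)
    # would keep splitting updates into ~50-sha1 subsamples indefinitely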
    searcher_subsampler = searcher_hbaseremote.Searcher(global_conf_file)
    print "[subsampler: log] Subsampler ready."
    
    total_batches = 0
    need_break = False

    while True:
        if need_break:
            break

        start_get_batch = time.time()
        nb_list_sha1s = 0
        while nb_list_sha1s <= nb_subsamples:
            update_id, str_list_sha1s = searcher_subsampler.indexer.get_next_batch_precomp_sim()
            if update_id is None:
                # no more updates: str_list_sha1s may be None, so stop before splitting it
                break
            list_sha1s = str_list_sha1s.split(',')
            nb_list_sha1s = len(list_sha1s)
        
        if update_id is None:
            print "[subsampler: log] No more update to process."
            sys.stdout.flush()
            break
        else:
            print "[subsampler: log] Got update {} in {}s".format(update_id, time.time() - start_get_batch)
            sys.stdout.flush()
        
            start_subsample = time.time()
            subsample_id = 0
            subsample_list_sha1s = []
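            # chunk list_sha1s into subsamples of nb_subsamples items, e.g. a
            # (hypothetical) 120-sha1 update with nb_subsamples=50 would be written
            # out as updates 'update_id_0' (50), 'update_id_1' (50), 'update_id_2' (20)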
            
            for i,sha1 in enumerate(list_sha1s):
                if i%nb_subsamples==0 and i>0:
                    subsampled_update_id = update_id+"_"+str(subsample_id)
                    write_subsampled_update(subsampled_update_id, subsample_list_sha1s, searcher_subsampler)
                    subsample_id += 1
                    total_batches += 1
                    if limit_batches is not None and total_batches>limit_batches:
                        need_break = True
                        break
                    subsample_list_sha1s = [sha1]
                else:
                    subsample_list_sha1s.append(sha1)
            if len(subsample_list_sha1s)>0:
                subsampled_update_id = update_id+"_"+str(subsample_id)
                write_subsampled_update(subsampled_update_id, subsample_list_sha1s, searcher_subsampler)
                total_batches += 1
                if limit_batches is not None and total_batches>limit_batches:
                    need_break = True
                    
        if not need_break:
            print "[subsampler: log] Sleeping {}s before getting next update.".format(time_sleep)     
            time.sleep(time_sleep)
def consumer(global_conf_file, queueIn, queueOut, queueConsumer):
    print "[consumer-pid({}): log] Started a consumer worker at {}".format(
        os.getpid(), get_now())
    sys.stdout.flush()
    searcher_consumer = searcher_hbaseremote.Searcher(global_conf_file)
    print "[consumer-pid({}): log] Consumer worker ready at {}".format(
        os.getpid(), get_now())
    queueConsumer.put("Consumer ready")
    sys.stdout.flush()
    while True:
        try:
            ## reads from queueIn
            print "[consumer-pid({}): log] Consumer worker waiting for update at {}".format(
                os.getpid(), get_now())
            sys.stdout.flush()
            update_id, valid_sha1s, start_precomp = queueIn.get(
                True, queue_timeout)
            if update_id is None:
                # declare worker ended
                print "[consumer-pid({}): log] Consumer worker ending at {}".format(
                    os.getpid(), get_now())
                return end_consumer(queueIn, queueOut)
            ## search
            print "[consumer-pid({}): log] Consumer worker computing similarities for {} valid sha1s of update {} at {}".format(
                os.getpid(), len(valid_sha1s), update_id, get_now())
            sys.stdout.flush()
            start_search = time.time()
            # precompute similarities using searcher
            # for v1 check_indexed_noprecomp
            #simname, corrupted = searcher_consumer.search_from_sha1_list_get_simname(valid_sha1s, update_id)
            simname, corrupted = searcher_consumer.search_from_listid_get_simname(
                valid_sha1s, update_id, check_already_computed=True)
            elapsed_search = time.time() - start_search
            print "[consumer-pid({}): log] Consumer worker processed update {} at {}. Search performed in {}s.".format(
                os.getpid(), update_id, get_now(), elapsed_search)
            sys.stdout.flush()
            ## push to queueOut
            #queueIn.task_done()
            start_push = time.time()
            queueOut.put((update_id, simname, valid_sha1s, corrupted,
                          start_precomp, elapsed_search))
            print "[consumer-pid({}): log] Consumer worker pushed update {} to queueOut in {}s at {}.".format(
                os.getpid(), update_id,
                time.time() - start_push, get_now())
            sys.stdout.flush()
        except Exception as inst:
            print "[consumer-pid({}): error] Consumer worker caught error at {}. Error was {}".format(
                os.getpid(), get_now(), inst)
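# A minimal sketch of wiring the producer/consumer/finalizer workers above
# together with multiprocessing; the function name, the number of consumers
# and the 'ready' handshake are assumptions based on the queue protocol used
# above, and nb_consumers is assumed to match the module-level nb_workers
# global that finalizer reads.
def run_precomp_sim_sketch(global_conf_file, nb_consumers=2):
    from multiprocessing import Process, Queue
    queueIn, queueOut = Queue(), Queue()
    queueProducer, queueConsumer, queueFinalizer = Queue(), Queue(), Queue()
    procs = [Process(target=producer, args=(global_conf_file, queueIn, queueProducer)),
             Process(target=finalizer, args=(global_conf_file, queueOut, queueFinalizer))]
    procs += [Process(target=consumer, args=(global_conf_file, queueIn, queueOut, queueConsumer))
              for _ in range(nb_consumers)]
    for p in procs:
        p.start()
    # wait for each worker's ready message before considering the pipeline live
    queueProducer.get()
    queueFinalizer.get()
    for _ in range(nb_consumers):
        queueConsumer.get()
    for p in procs:
        p.join()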
def producer(global_conf_file, queueIn, queueProducer):
    print "[producer-pid({}): log] Started a producer worker at {}".format(
        os.getpid(), get_now())
    sys.stdout.flush()
    searcher_producer = searcher_hbaseremote.Searcher(global_conf_file)
    print "[producer-pid({}): log] Producer worker ready at {}".format(
        os.getpid(), get_now())
    queueProducer.put("Producer ready")
    while True:
        try:
            start_get_batch = time.time()
            update_id, str_list_sha1s = searcher_producer.indexer.get_next_batch_precomp_sim()
            #queueProducer.put("Producer got batch")
            print "[producer-pid({}): log] Got batch in {}s at {}".format(
                os.getpid(),
                time.time() - start_get_batch, get_now())
            sys.stdout.flush()
            if update_id is None:
                print "[producer-pid({}): log] No more update to process.".format(
                    os.getpid())
                return end_producer(queueIn)
            else:
                start_precomp = time.time()
                # check that sha1s of batch have no precomputed similarities already in sha1_infos table
                valid_sha1s, not_indexed_sha1s, precomp_sim_sha1s = check_indexed_noprecomp(
                    searcher_producer, str_list_sha1s.split(','))
                # should we split valid_sha1s into batches of 100 or something smaller than the current 10K?
                searcher_producer.indexer.write_batch(
                    [(update_id, {
                        searcher_producer.indexer.precomp_start_marker: 'True'
                    })], searcher_producer.indexer.table_updateinfos_name)
                # push updates to be processed in queueIn
                # https://docs.python.org/3/library/multiprocessing.html#multiprocessing.Queue.qsize
                # qsize raises NotImplementedError on OS X...
                #print "[producer: log] Pushing update {} in queue containing {} items at {}.".format(update_id, queueIn.qsize(), get_now())
                print "[producer-pid({}): log] Pushing update {} at {}.".format(
                    os.getpid(), update_id, get_now())
                sys.stdout.flush()
                queueIn.put((update_id, valid_sha1s, start_precomp))
                print "[producer-pid({}): log] Pushed update {} to queueIn at {}.".format(
                    os.getpid(), update_id, get_now())
                sys.stdout.flush()
        except Exception as inst:
            print "[producer-pid({}): error] Error at {}. Leaving. Error was: {}".format(
                os.getpid(), get_now(), inst)
            return end_producer(queueIn)
        flash(flash_message, 'message')
        headers = {'Content-Type': 'text/html'}
        sys.stdout.flush()
        # TODO: pass named arguments instead of flash messages
        return make_response(render_template('view_similar_images.html'), 200,
                             headers)


api.add_resource(APIResponder, '/cu_image_search/<string:mode>')

if __name__ == '__main__':

    parser = ArgumentParser()
    parser.add_argument("-c", "--conf", dest="conf_file", default=None)
    options = parser.parse_args()
    if options.conf_file is not None:
        print "Setting conf file to: {}".format(options.conf_file)
        global_conf_file = options.conf_file

    global_searcher = searcher_hbaseremote.Searcher(global_conf_file)
    global_start_time = datetime.now()

    ## This cannot recover from an 'IOError: [Errno 32] Broken pipe' error when a client disconnects before the response has been sent e.g. nginx timeout at memexproxy...
    #app.run(debug=True, host='0.0.0.0')
    #app.run(debug=False, host='0.0.0.0')

    from gevent.wsgi import WSGIServer
    http_server = WSGIServer(('', 5000), app)
    #http_server = WSGIServer(('', 5002), app)
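    # the API should then be reachable at e.g. http://localhost:5000/cu_image_search/<mode> (hypothetical host)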
    http_server.serve_forever()
def finalizer(global_conf_file, queueOut, queueFinalizer):
    print "[finalizer-pid({}): log] Started a finalizer worker at {}".format(
        os.getpid(), get_now())
    sys.stdout.flush()
    import glob
    searcher_finalizer = searcher_hbaseremote.Searcher(global_conf_file)
    print "[finalizer-pid({}): log] Finalizer worker ready at {}".format(
        os.getpid(), get_now())
    sys.stdout.flush()
    queueFinalizer.put("Finalizer ready")
    count_workers_ended = 0
    sim_pattern = '*-sim_' + str(searcher_finalizer.ratio) + '.txt'
    while True:
        try:
            ## Read from queueOut
            print "[finalizer-pid({}): log] Finalizer worker waiting for an update at {}".format(
                os.getpid(), get_now())
            sys.stdout.flush()
            found_update = False

            ## Use glob to list the files that match the simname pattern.
            list_simfiles = glob.glob(sim_pattern)

            for simname in list_simfiles:
                found_update = True
                start_finalize = time.time()

                # parse update_id
                update_id = simname.split('-')[0]

                print "[finalizer-pid({}): log] Finalizer worker found update {} to finalize at {}".format(
                    os.getpid(), update_id, get_now())
                sys.stdout.flush()

                ## Check if update was not already finished by another finalizer?

                ## Push computed similarities

                # format for saving in HBase:
                # - batch_sim: should be a list of sha1 row key, dict of "s:similar_sha1": dist_value
                # - batch_mark_precomp_sim: should be a list of sha1 row key, dict of precomp_sim_column: True
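                # e.g. with hypothetical sha1s and distance:
                #   batch_sim = [('sha1A', {'s:sha1B': '0.12'})]
                #   batch_mark_precomp_sim = [('sha1A', {precomp_sim_column: 'True'})]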
                batch_sim, batch_mark_precomp_sim = format_batch_sim_v2(
                    simname, searcher_finalizer)

                # push similarities to HBI_table_sim (escorts_images_similar_row_dev) using searcher.indexer.write_batch
                if batch_sim:
                    searcher_finalizer.indexer.write_batch(
                        batch_sim, searcher_finalizer.indexer.table_sim_name)
                    # push to weekly update table for Amandeep to integrate in DIG
                    week, year = get_week_year()
                    weekly_sim_table_name = searcher_finalizer.indexer.table_sim_name + "_Y{}W{}".format(
                        year, week)
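                    # e.g. 'escorts_images_similar_row_dev_Y2017W15' for (hypothetical) week 15 of 2017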
                    print "[finalizer-pid({}): log] weekly table name: {}".format(
                        os.getpid(), weekly_sim_table_name)
                    weekly_sim_table = searcher_finalizer.indexer.get_create_table(
                        weekly_sim_table_name, families={'s': dict()})
                    searcher_finalizer.indexer.write_batch(
                        batch_sim, weekly_sim_table_name)

                    ## Mark as done
                    # mark precomp_sim true in escorts_images_sha1_infos
                    searcher_finalizer.indexer.write_batch(
                        batch_mark_precomp_sim,
                        searcher_finalizer.indexer.table_sha1infos_name)
                    # mark update as processed
                    searcher_finalizer.indexer.write_batch([(update_id, {
                        searcher_finalizer.indexer.precomp_end_marker:
                        'True'
                    })], searcher_finalizer.indexer.table_updateinfos_name)

                ## Cleanup
                try:
                    # remove simname
                    os.remove(simname)
                    # remove features file
                    featfn = update_id + '.dat'
                    os.remove(featfn)
                except Exception as inst:
                    print "[finalizer-pid({}): error] Could not cleanup. Error was: {}".format(
                        os.getpid(), inst)

                # We don't have start_precomp anymore
                #print "[finalizer-pid({}): log] Finalize update {} at {} in {}s total.".format(os.getpid(), update_id, get_now(), time.time() - start_precomp)
                print "[finalizer-pid({}): log] Finalized update {} at {} in {}s.".format(
                    os.getpid(), update_id, get_now(),
                    time.time() - start_finalize)
                sys.stdout.flush()
                # if debug:
                #     print "Sleeping for {}s.".format(debug_sleep)
                #     sys.stdout.flush()
                #     time.sleep(debug_sleep)

            # Check if consumers have ended
            try:
                end_signal = queueOut.get(block=False)
                if end_signal == consumer_end_signal:
                    count_workers_ended += 1
                    print "[finalizer-pid({}): log] {} consumer workers ended out of {} at {}.".format(
                        os.getpid(), count_workers_ended, nb_workers,
                        get_now())
                    if count_workers_ended == nb_workers:
                        # should we check for intermediate sim patterns to know if consumers are actually still running, or failed?
                        # sim_pattern = '*-sim.txt'
                        # fully done
                        print "[finalizer-pid({}): log] All consumer workers ended at {}. Leaving.".format(
                            os.getpid(), get_now())
                        return end_finalizer(queueFinalizer)
                    continue
            except Exception as inst:  # most likely Queue.Empty: no end signal yet
                pass

            # Sleep if no update was found in this loop cycle
            if not found_update:
                time.sleep(time_sleep)

        except Exception as inst:
            #[finalizer: error] Caught error at 2017-04-14:04.29.23. Leaving. Error was: list index out of range
            print "[finalizer-pid({}): error] Caught error at {}. Error {} was: {}".format(
                os.getpid(), get_now(), type(inst), inst)
def finalizer(global_conf_file, queueOut, queueFinalizer):
    print "[finalizer-pid({}): log] Started a finalizer worker at {}".format(
        os.getpid(), get_now())
    sys.stdout.flush()
    searcher_finalizer = searcher_hbaseremote.Searcher(global_conf_file)
    print "[finalizer-pid({}): log] Finalizer worker ready at {}".format(
        os.getpid(), get_now())
    queueFinalizer.put("Finalizer ready")
    count_workers_ended = 0
    while True:
        try:
            ## Read from queueOut
            print "[finalizer-pid({}): log] Finalizer worker waiting for an update at {}".format(
                os.getpid(), get_now())
            sys.stdout.flush()
            # This seems to block (or not get the update info) even when items have been pushed to queueOut??
            update_id, simname, valid_sha1s, corrupted, start_precomp, elapsed_search = queueOut.get(
                block=True, timeout=queue_timeout)
            if update_id is None:
                count_workers_ended += 1
                print "[finalizer-pid({}): log] {} consumer workers ended out of {} at {}.".format(
                    os.getpid(), count_workers_ended, nb_workers, get_now())
                #queueOut.task_done()
                if count_workers_ended == nb_workers:
                    # fully done
                    print "[finalizer-pid({}): log] All consumer workers ended at {}. Leaving.".format(
                        os.getpid(), get_now())
                    return end_finalizer(queueOut, queueFinalizer)
                continue
            print "[finalizer-pid({}): log] Finalizer worker got update {} from queueOut to finalize at {}".format(
                os.getpid(), update_id, get_now())
            sys.stdout.flush()

            ## Check if update was not already finished by another finalizer?

            ## Push computed similarities
            print "[finalizer-pid({}): log] simname: {}".format(os.getpid(), simname)
            # format for saving in HBase:
            # - batch_sim: should be a list of sha1 row key, dict of "s:similar_sha1": dist_value
            # - batch_mark_precomp_sim: should be a list of sha1 row key, dict of precomp_sim_column: True
            batch_sim, batch_mark_precomp_sim = format_batch_sim(
                simname, valid_sha1s, corrupted, searcher_finalizer)

            # push similarities to HBI_table_sim (escorts_images_similar_row_dev) using searcher.indexer.write_batch
            if batch_sim:
                searcher_finalizer.indexer.write_batch(
                    batch_sim, searcher_finalizer.indexer.table_sim_name)
                # push to weekly update table for Amandeep to integrate in DIG
                week, year = get_week_year()
                weekly_sim_table_name = searcher_finalizer.indexer.table_sim_name + "_Y{}W{}".format(
                    year, week)
                print "[finalizer-pid({}): log] weekly table name: {}".format(
                    os.getpid(), weekly_sim_table_name)
                weekly_sim_table = searcher_finalizer.indexer.get_create_table(
                    weekly_sim_table_name, families={'s': dict()})
                searcher_finalizer.indexer.write_batch(batch_sim,
                                                       weekly_sim_table_name)

                ## Mark as done
                # mark precomp_sim true in escorts_images_sha1_infos_dev
                searcher_finalizer.indexer.write_batch(
                    batch_mark_precomp_sim,
                    searcher_finalizer.indexer.table_sha1infos_name)

            # mark info:precomp_finish in escorts_images_updates_dev
            if not corrupted:  # do not mark finished if we faced some issue? mark as corrupted?
                searcher_finalizer.indexer.write_batch(
                    [(update_id, {
                        searcher_finalizer.indexer.precomp_end_marker: 'True'
                    })], searcher_finalizer.indexer.table_updateinfos_name)

            print "[finalizer-pid({}): log] Finalize update {} at {} in {}s total.".format(
                os.getpid(), update_id, get_now(),
                time.time() - start_precomp)
            sys.stdout.flush()

            ## Cleanup
            if simname:
                try:
                    # remove simname
                    os.remove(simname)
                    # remove features file
                    featfirst = simname.split('sim')[0]
                    featfn = featfirst[:-1] + '.dat'
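                    # e.g. for a (hypothetical) simname 'someupdateid-sim_0.9.txt',
                    # featfirst is 'someupdateid-' and featfn 'someupdateid.dat'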
                    #print "[process_one_update: log] Removing file {}".format(featfn)
                    os.remove(featfn)
                except Exception as inst:
                    print "[finalizer-pid({}): error] Could not cleanup. Error was: {}".format(
                        os.getpid(), inst)
            #queueOut.task_done()
        except Exception as inst:
            #[finalizer: error] Caught error at 2017-04-14:04.29.23. Leaving. Error was: list index out of range
            print "[finalizer-pid({}): error] Caught error at {}. Error {} was: {}".format(
                os.getpid(), get_now(), type(inst), inst)