def main(argv, stop_after_init=False, preset_set_of_users=None):
    pickle_path_lptq = '/tmp/process_log_times_pq.bin.gz'
    pickle_path_clicks = '/tmp/process_log_clicks.bin.gz'
    pickle_path_clusters = '/tmp/process_log_clusters.dict.txt.gz'
    pickle_path_removed_queries = '/tmp/process_log_removed_queries.lst.txt.gz'
    pickle_path_big_queries_set = '/tmp/process_log_big_queries_set.lst.txt.gz'
    pickle_path_users = '/tmp/process_log_usets_set.lst.txt.gz'

    argc = len(argv)
    if argc <= len(CLI_ARGS):
        print "Usage: %s" % argv[0], ' '.join(CLI_ARGS)
        print "Currently missing arguments:", ' '.join(CLI_ARGS[len(argv)-1:])
        exit()

    global mdb_host
    mdb_host = argv[1].strip()
    filter_queries_file = argv[2].strip()
    allowed_users_file = argv[3].strip()
    log_filepath = argv[4].strip()
    clusters_file = argv[5].strip()
    queries_to_ids_file = argv[6].strip()

    t_init = time()

    # print "Starting... compute_everything()"
    # t0 = time()
    # everything = compute_everything(mdb_host, filter_queries_file, allowed_users_file)
    # removed_queries = everything['removed_queries']
    # print "Done ", time()-t0

    ####################################################################################################################
    # We will do a lot of "is in allowed users?" lookups, so we need a set, not a list
    print "Loading users..."
    # users_set = set([int(line.strip()) for line in univ_open(allowed_users_file, mode='r')])
    # We use compute_everything() because it gets rid of the null-cluster queries before retrieving the list of
    # users, thus reducing the dataset overall, as queries are then retrieved from the users set
    t0 = time()
    global users_set
    if preset_set_of_users is not None:
        users_set = preset_set_of_users
    else:
        try:
            print "Trying to pickle from disk...", pickle_path_users
            with gzopen(pickle_path_users, 'r') as f:
                print "File", pickle_path_users, "was found!"
                users_set = set(load_pickled_list(f))
                pickled = True
        except Exception as err:
            print "Error for", pickle_path_users, "was:", err
            # if not isinstance(err, IOError):
            print "No pickled file or error loading it, recomputing..."
            pickled = False
            # Note: we use compute_everything() here because it loads the query clusters OF THE INITIAL QUERIES
            # only, removes the ones that have null clusterings, and then generates the list of users who
            # queried the pruned list of queries.
            # We do not directly use the clusters from it, nor the queries, because we still have to remove the
            # other queries that have null clustering vectors. By "other queries" we mean queries that are not
            # part of the seed used to select users/data: any query that is part of a user profile of one of
            # the allowed users (the ones who queried the query list seed).
            # This bigger queries set is generated by load_big_query_set() in this file.
            users_set = set(compute_everything(mdb_host, filter_queries_file, allowed_users_file)['users'])
            print "Done ", time()-t0
        print "Total number of users that will be analyzed:", len(users_set)
        pickle_ask(pickled, pickle_path_users, users_set, dump_f=pickle_list)
    print "Done ", time()-t0
    # everything = None  # We are not using it afterwards, so this should help the GC

    ####################################################################################################################
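    # The try/except "load from gzip cache or recompute" dance above is repeated for every cached artifact in
    # this function. A minimal sketch of the shared pattern, reusing the existing gzopen()/load_pickled_list()
    # helpers (the helper name below is hypothetical, shown for illustration only):
    #
    # def load_cached_set(path):
    #     try:
    #         with gzopen(path, 'r') as f:
    #             return set(load_pickled_list(f)), True   # (data, was_pickled)
    #     except Exception:
    #         return None, False                           # caller recomputes and re-asks to pickle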
    # import itertoolsmodule as iter
    print "Computing the set of allowed queries..."
    t0 = time()
    try:
        print "Trying to pickle from disk...", pickle_path_big_queries_set
        with gzopen(pickle_path_big_queries_set, 'r') as f:
            big_queries_set = set(load_pickled_list(f))
            pickled = True
    except Exception as err:
        if not isinstance(err, IOError):
            print "Error for", pickle_path_big_queries_set, "was:", err
        print "No pickled file or error loading it, recomputing..."
        pickled = False
        big_queries_set = load_big_query_set(log_filepath, users_set)
    print "Done ", time()-t0
    print "Total number of queries that will be analyzed:", len(big_queries_set)
    pickle_ask(pickled, pickle_path_big_queries_set, big_queries_set, dump_f=pickle_list)

    ####################################################################################################################
    global clusters
    print "Pre-initializing clusters dict..."
    t0 = time()
    clusters = dict.fromkeys(big_queries_set)
    print "clusters now has", len(clusters), "keys"
    print "Done ", time()-t0

    print "Retrieving big list of clusters for the", len(big_queries_set), "queries..."
    t0 = time()
    global clusters_loaded
    clusters_loaded = False
    p_clusters, mapres_clusters = run_in_bg_process(do_process_clusters_pickle, (pickle_path_clusters,))

    def join_clusters():
        p_clusters.join()
        global clusters, clusters_loaded
        if clusters_loaded:
            return clusters
        result = mapres_clusters.get()[0]
        if result is False:
            # The pickling from disk did not work, recompute it in place (join_clusters() is called when
            # clusters are NEEDED, so we cannot wait/async this)
            print "Error while pickling clusters from disk", pickle_path_clusters, ", recomputing..."
            t0 = time()
            result = do_process_clusters_recompute(big_queries_set, clusters_file, queries_to_ids_file, clusters)
            print "Done do_process_clusters_recompute()", time()-t0
            # Any user input needs to be on the main thread; pickle_ask() will by itself send the pickling task
            # to a bg worker process if the user answers yes
            pickle_ask(False, pickle_path_clusters, result, dump_f=pickle_dict)
        clusters_loaded = True
        clusters = result
        return clusters

    ####################################################################################################################
    removed_queries = compute_removed_queries_because_of_null_clustering(
        pickle_path_removed_queries, clusters, join_clusters
    )
    print "Removed", len(removed_queries), "out of", len(big_queries_set)

    ####################################################################################################################
    t1 = time()
    print "Launching process_log_clicks computation in a separate process"
    p_lpc, lpc_mapres = run_in_bg_process(process_log_clicks, (log_filepath, users_set, removed_queries))
    p_lpc.close()

    ####################################################################################################################
    print "Starting... process_log_times_pq()"
    t0 = time()
    try:
        print "Trying to pickle from disk...", pickle_path_lptq
        lptpq = pload(gzopen(pickle_path_lptq, 'rb'))
        pickled = True
    except Exception as err:
        if not isinstance(err, IOError):
            print "Error for", pickle_path_lptq, "was:", err
        print "No pickled file or error loading it, recomputing..."
        pickled = False
        lptpq = process_log_times_pq(log_filepath, users_set, removed_queries)
    print "Done process_log_times_pq() in", time()-t0
    pickle_ask(pickled, pickle_path_lptq, lptpq)
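    # run_in_bg_process() (defined elsewhere in this module) hands one function call to a background worker so
    # it overlaps with the work below; the call sites here only rely on it returning a pool-like handle plus an
    # async result whose .get() yields a one-element list. A plausible sketch on top of multiprocessing (an
    # assumption about the helper, not necessarily its actual implementation):
    #
    # from multiprocessing import Pool
    # def run_in_bg_process(f, args):
    #     p = Pool(processes=1)
    #     mapres = p.map_async(apply_in_worker, [(f, args)])  # .get() -> [f(*args)]
    #     return p, mapres
    #
    # where apply_in_worker((f, args)) simply returns f(*args) and must live at module level so that it can be
    # pickled by multiprocessing.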
process_log_clicks()" t0 = time() # Note: Disabled the pickling as, for some reason, it does not work # and there is only ~15s difference between recomputation and pickling from disk anyway... # try: # print "Trying to pickle from disk..." # lpc = pload(open(pickle_path_clicks, 'rb')) # pickled = True # except Exception as err: # if not isinstance(err, IOError): # print "Error was:", err # print "No pickled files or error loading it, recomputing..." # pickled = False # lpc = process_log_clicks(log_filepath, users_set, removed_queries) ######################################################################################################################## ######################################################################################################################## print "waiting for the pool to finish, if not finished yet..." p_lpc.join() lpc = lpc_mapres.get()[0] print "Took a total time of", time()-t1, "or less" ######################################################################################################################## ######################################################################################################################## print "Done ", time()-t0 # pickle_ask(pickled, pickle_path_clicks, lpc) print "Some reprocessing..." # We need the clusters from now on, so let us wait for the children process to be finished and the data # transferred back to us join_clusters() print "Removing null-vectors clusters queries from `clusters`..." t0 = time() for qid in removed_queries: try: del clusters[qid] except KeyError: pass # If it was already not there, that's perfect print "Done ", time()-t0 t0 = time() for user_queries_dic in lpc.user_clicks_number: if user_queries_dic is None: continue del user_queries_dic['_id'] for user_queries_dic in lptpq.user_queries_number: if user_queries_dic is None: continue del user_queries_dic['_id'] print "Done ", time()-t0 # Deprecated, for now, but we might switch back to it so, keep it for now print "Computing number of users who issued the query, per query..." t0 = time() number_of_users_who_queried = dict.fromkeys(big_queries_set - removed_queries, 0) for query_dict in lptpq.user_queries_number: if query_dict is None: continue for qid in query_dict: number_of_users_who_queried[qid] += 1 print "Done ", time()-t0 print "Computing number of users who clicked, per query..." t0 = time() number_of_users_who_clicked = dict.fromkeys(big_queries_set - removed_queries, 0) for query_dict in lpc.user_clicks_number: if query_dict is None: continue for qid in query_dict: number_of_users_who_clicked[qid] += 1 print "Done ", time()-t0 # # GC big_queries_set = None removed_queries = None # print "Some reprocessing..." # t0 = time() # for user_queries_dic in lpc.user_clicks_number: # if user_queries_dic is None: # continue # del user_queries_dic['_id'] # for q in removed_queries: # try: # del user_queries_dic[q] # except KeyError: # # key was not there? fine, we did not need to delete it then # pass # for user_queries_dic in lptpq.user_queries_number: # if user_queries_dic is None: # continue # del user_queries_dic['_id'] # for q in removed_queries: # try: # del user_queries_dic[q] # except KeyError: # # key was not there? fine, we did not need to delete it then # pass print "Done ", time()-t0 print "Starting..." 
    print "Starting..."
    t0 = time()
    us.init(
        lpc.user_clicks_number,
        lptpq.user_queries_number,
        clusters,
        users_set,
        number_of_users_who_queried,
        number_of_users_who_clicked
    )
    print "Done ", time()-t0
    # Note: At this point in the main() execution, the script takes ~2.5G of RAM.

    print "Total initialization phase time:", time()-t_init
    if stop_after_init:
        return

    print "Initializing users similarity computation phase..."
    # Similarity computation benchmark:
    t0 = time()
    i = 0
    global DATA_SET_SIZE
    DATA_SET_SIZE = len(users_set)
    # Note: a too-small batch size will waste time respawning processes and re-generating the user_sim module
    # cache, but a too-high batch size will kill MongoDB and the computer's RAM (as 1 batch size unit is 1 user
    # computed by the process, and the process commits everything at once)
    print "Generating sorted users set..."
    print "Generating workers pool..."
    p = Pool(processes=POOL_SIZE)
    start_values = range(0, DATA_SET_SIZE, BATCH_SIZE)
    print "Mapping (launching) pool to", len(start_values), "different start_values", start_values
    t0 = time()
    p.map(compute_user_sim_batch, start_values)
    p.close()
    p.join()
    print "Workers finished in %.3f." % (time()-t0)

    # for u in users_set:
    #     for u2 in users_set:
    #         i += 1
    #         try:
    #             us.sim(u, u2)
    #         except KeyError as err:
    #             print err
    #             key = err.args[0]
    #             print key, "in big_queries_set?", key in big_queries_set
    #             print key, "in removed_queries?", key in removed_queries
    #             print key, "in clusters?", key in clusters
    #             res = False
    #             for u_dict in lpc.user_clicks_number:
    #                 if u_dict is not None:
    #                     res |= (key in u_dict)
    #             print key, "in clicks?", res
    #             res = False
    #             for u_dict in lptpq.user_queries_number:
    #                 if u_dict is not None:
    #                     res |= (key in u_dict)
    #             print key, "in user_queries_number?", res
    #         if i % 10000 == 0:
    #             print i+1, "\t\tsim() calls in\t\t", time()-t0, "\t\taverage\t\t", (time()-t0)/float(i+1)

    raw_input("Now what?")
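# compute_user_sim_batch() (defined elsewhere in this module) receives one start offset per pool task. The
# batching in main() implies a contract along these lines; a sketch under that assumption, where
# `sorted_users` is a hypothetical name for the sorted snapshot of users_set the workers are expected to share:
#
# def compute_user_sim_batch(start):
#     for u in sorted_users[start:start + BATCH_SIZE]:
#         for u2 in sorted_users:
#             us.sim(u, u2)   # each worker commits its whole batch at once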
def top_k_queries(k, host, queries_filter_file, allowed_users_file, ids_mapping_file):
    queries_ids = compute_everything(host, queries_filter_file, allowed_users_file, k)['q_list']
    queries_strings_indexed_by_id = [l.strip() for l in univ_open(ids_mapping_file)]
    queries_strings = [queries_strings_indexed_by_id[i] for i in queries_ids]
    return queries_strings
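# Example use of top_k_queries() (hypothetical host and file paths, for illustration only):
# top_100 = top_k_queries(100, 'localhost', '/data/filter_queries.txt',
#                         '/data/allowed_users.txt', '/data/queries_to_ids.txt')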