def main(stop_after_init=False):
    from sys import argv
    argc = len(argv)
    if argc <= len(CLI_ARGS):
        print 'Usage: %s %s %s' % (argv[0], ' '.join(CLI_ARGS), ' '.join(["[%s]" % x for x in OPTIONAL_ARGS]))
        print 'Currently missing parameters arguments:', ' '.join(CLI_ARGS[len(argv)-1:])
        exit()
    files = []
    for x in xrange(1, argc-1):
        files.append(argv[x].strip())
    output_path = argv[-1].strip()
    t_init = time()
    t0 = time()
    print "Loading result files..."
    serps_combined = {}
    for crawl_result_file in files:
        print "Loading", crawl_result_file, "..."
        t1 = time()
        with univ_open(crawl_result_file, 'r') as f:
            merge_serps(serps_combined, jload(f))
        print "Done in", time()-t1
    print "All files done in", time()-t0
    print "Writing URLs to output file", output_path, "..."
    t0 = time()
    # Use a with-block so the output file is properly flushed and closed
    with univ_open(output_path, 'w+') as out:
        jdump(serps_combined, out)
    print "Done in", time()-t0
    print "Script executed in", time() - t_init, "seconds"
def main(stop_after_init=False):
    from sys import argv
    argc = len(argv)
    if argc <= len(CLI_ARGS):
        print 'Usage: %s' % argv[0], ' '.join(CLI_ARGS)
        print 'Currently missing parameters arguments:', ' '.join(CLI_ARGS[len(argv)-1:])
        exit()
    file_to_convert = argv[1].strip()
    mapping_file = argv[2].strip()
    output_path = argv[3].strip()
    t_init = time()
    query_strings_to_ids = {}
    with univ_open(mapping_file, 'r') as f:
        i = 0
        for l in f:
            query_strings_to_ids[l.strip().lower()] = i
            i += 1
    with univ_open(output_path, 'w+') as out:
        with univ_open(file_to_convert, 'r') as f:
            out.write('\n'.join([str(query_strings_to_ids[l.strip().lower()]) for l in f]))
    print "Script executed in", time() - t_init, "seconds"
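# Illustrative sketch (not part of the original script): the mapping file above is read with the
# convention "line number == query id", so a file whose first two lines are "cheap flights" and
# "weather paris" maps those strings to ids 0 and 1 respectively.
def _example_query_string_to_id_mapping():
    demo_mapping_lines = ["cheap flights\n", "weather paris\n"]  # hypothetical file contents
    query_strings_to_ids = {}
    for i, l in enumerate(demo_mapping_lines):
        query_strings_to_ids[l.strip().lower()] = i
    assert query_strings_to_ids["weather paris"] == 1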
def main(stop_after_init=False):
    from sys import argv
    argc = len(argv)
    if argc <= len(CLI_ARGS):
        print 'Usage: %s %s %s' % (argv[0], ' '.join(CLI_ARGS), ' '.join(["[%s]" % x for x in OPTIONAL_ARGS]))
        print 'Currently missing parameters arguments:', ' '.join(CLI_ARGS[len(argv)-1:])
        exit()
    crawl_result_file = argv[1].strip()
    output_path = argv[2].strip()
    split_into_n = 1
    if argc > len(CLI_ARGS) + 1:
        split_into_n = int(argv[len(CLI_ARGS) + 1])
    t_init = time()
    t0 = time()
    print "Loading result file..."
    with univ_open(crawl_result_file, 'r') as f:
        serps = jload(f)
    print "Done in", time()-t0
    t0 = time()
    print "Writing URLs to output file", output_path, "..."
    # The set() is there because we do not need the same URL multiple times in the seed,
    # and duplicates are actually pretty likely to happen in SERPs
    # (on an example run, adding the set() brought us from 4070 URLs down to 2387).
    result = list(set([
        url
        for query_serps in serps.values()
        for serp in query_serps
        for pos, url in serp['results']
    ]))
    urls_n = len(result)
    # Guard against a zero batch size when there are fewer URLs than requested splits
    batch_size = max(1, urls_n / split_into_n)
    i = 0
    for start in range(0, urls_n, batch_size):
        i += 1
        dir, fname = os.path.split(output_path)
        outp = os.path.join(dir, "%03d_%s" % (i, fname))
        print "Dumping into", outp
        with univ_open(outp, 'w+') as out:
            out.write('\n'.join(result[start:start+batch_size]))
    print "Done in", time()-t0
    print "Script executed in", time() - t_init, "seconds"
def load_mapping(filepath):
    mapping = {}
    with univ_open(filepath, 'r') as f:
        for line in f:
            queryid, urlid, domainid = map(int, line.strip().split(','))
            mapping[(queryid, urlid)] = domainid
    return mapping
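# Illustrative usage sketch (not from the original code): load_mapping() expects one
# comma-separated integer triple "queryid,urlid,domainid" per line; the demo lines below
# are hypothetical.
def _example_load_mapping_format():
    demo_lines = ["42,1337,7\n", "42,2048,9\n"]
    mapping = {}
    for line in demo_lines:
        queryid, urlid, domainid = map(int, line.strip().split(','))
        mapping[(queryid, urlid)] = domainid
    assert mapping[(42, 1337)] == 7  # same lookup shape as load_mapping(filepath)[(42, 1337)]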
def process(self, **options):
    try:
        queries_filter = options["queries_filter"]
    except KeyError:
        queries_filter = None
    self.items_ranks = [0] * (self.MAX_ITEM_RANK + 1)
    with univ_open(self.log_filepath, mode="r") as f:
        for line in f:
            line_arr = line.strip().split("\t")
            if len(line_arr) == 5:  # This is a clickthrough line
                user_id, keywords, date, item_rank, domain = line_arr
                try:
                    item_rank = int(item_rank)
                except ValueError as err:
                    print err
                    print "The line that caused this error is the following:"
                    print line
                    continue  # Skip this value as we cannot parse it anyway
                if queries_filter is not None and keywords not in queries_filter:
                    # Skip this line, not in the filter
                    continue
                try:
                    self.items_ranks[item_rank] += 1
                except IndexError:
                    print "Wow! We got an item ranked", item_rank
                    print "Please increase the MAX_ITEM_RANK value"
                    exit()
    avg = sum([self.items_ranks[i] * i for i in xrange(1, len(self.items_ranks))]) / float(
        sum(self.items_ranks[1:])
    )
    return avg
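# Worked example (illustrative only) of the weighted average computed above:
# items_ranks[i] counts how many clicks landed on rank i, so the average clicked rank is
# sum(count_i * i) / sum(count_i) over ranks i >= 1.
def _example_average_clicked_rank():
    items_ranks = [0, 10, 5, 1]  # 10 clicks on rank 1, 5 on rank 2, 1 on rank 3
    avg = sum(items_ranks[i] * i for i in xrange(1, len(items_ranks))) / float(sum(items_ranks[1:]))
    assert abs(avg - (10*1 + 5*2 + 1*3) / 16.0) < 1e-9  # = 1.4375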
def load_clusters_for_queries(q_set, clusters_path, queries_id_mapping_filepath, pre_initialized_dict=None, cp=None):
    """
    Loads all the clustering vectors of the queries in the q_set parameter.
    :param q_set the set of queries to return the clustering vectors of
    :param queries_id_mapping_filepath The path to the file providing a mapping from query string to query id
    :param clusters_path the path to the clusters file
    :param pre_initialized_dict if passed as a parameter, this dictionary will be used directly instead of
    instantiating a new one; this can speed up computation if the dictionary already has keys initialized.
    :param {ClustersProcessor} cp: A pre-initialized ClustersProcessor to be used instead of initializing our own one
    :return a dictionary {query_id (int) => clustering vector (numpy.array)}
    """
    if cp is None:
        cp = ClustersProcessor(clusters_path)
        t0 = time()
        cp.process()
    # Avoid a mutable default argument being shared between calls
    clusters = pre_initialized_dict if pre_initialized_dict is not None else dict()
    with univ_open(queries_id_mapping_filepath, mode='r') as f:
        query_id = -1  # will represent both the query_id and the index in the list
        for line in f:
            query_id += 1
            if query_id % 200000 == 0:
                print "Currently at query_id=", query_id
            if query_id not in q_set:
                continue
            clusters[query_id] = array(cp.cluster_vector_for_kw(line))
    return clusters
def process(self, **options):
    self.clusters = []
    with univ_open(self.filepath, mode='r') as f:
        i = 0
        for line in f:
            i += 1
            if i % 10 == 0:
                print "Currently at cluster\t", i
            # A cluster line has the following form:
            # Cluster0: longhorn=02404432-n#174#20 bullock=02403820-n#120#20 angus=02405929-n#41#20
            line = line.strip().split(' ')
            line.pop(0)  # Get rid of the "clusterX:"
            try:
                self.clusters.append(
                    set([
                        kw.strip().split("=")[0].strip().lower()
                        for kw in line
                    ])
                )
            except IndexError as err:
                print err
                print "Cluster number", i, "IndexError was raised, cluster list is:"
                print line
    # Compute the overall set of keywords in the clusters:
    self.set_of_kw = set.union(*self.clusters)
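# Illustrative sketch (not part of the original class): parsing a single cluster line of the
# format shown in the comment above into a set of bare keywords.
def _example_parse_cluster_line():
    line = "Cluster0: longhorn=02404432-n#174#20 bullock=02403820-n#120#20 angus=02405929-n#41#20"
    parts = line.strip().split(' ')
    parts.pop(0)  # drop the "Cluster0:" prefix
    keywords = set(kw.strip().split("=")[0].strip().lower() for kw in parts)
    assert keywords == set(["longhorn", "bullock", "angus"])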
def main():
    import sys
    t0 = time()
    if len(sys.argv) < (len(CLI_ARGS)+1):
        print "Usage:", sys.argv[0], " ".join(CLI_ARGS), " ".join(OPT_ARGS)
        exit()
    # Run that immediately so that we crash on the spot if we cannot connect to the DB anyway
    from pymongo import MongoClient
    mdb_host = sys.argv[3].strip()
    mdb_conn = MongoClient(host=mdb_host)
    mdb = mdb_conn.users.clicks
    allowed_user_filepath = sys.argv[1].strip()
    log_filepath = sys.argv[2].strip()
    allowed_user_ids = set()
    with univ_open(allowed_user_filepath, 'r') as f:
        for line in f:
            allowed_user_ids.add(int(line.strip()))
    print "Loaded", len(allowed_user_ids), "allowed user ids."
    lp = process_logfile(log_filepath, allowed_user_ids)
    # Note that we drop the former data only now so that if something goes wrong during processing at least you still
    # have the former data
    print "Dropping previous DB"
    try:
        mdb.drop()
        mdb = mdb_conn.users.clicks
    except Exception as err:
        print type(err), err
    print "Dumping everything into MongoDB"
    t0 = time()
    batch_size = 200000
    end = 0
    for i in xrange(batch_size, len(lp.user_clicks_number), batch_size):
        start = i-batch_size
        end = i
        print "Batch", start, end
        insert_sublist(mdb, lp.user_clicks_number[start:end])
    # If len(lp.user_clicks_number) was not a multiple of batch_size, execute the last batch:
    if end < len(lp.user_clicks_number):
        print "Last batch..."
        insert_sublist(mdb, lp.user_clicks_number[end:])
    print "Creating indexes..."
    mdb.ensure_index([('uid', 1)])
    mdb.ensure_index([('qid', 1)])
    mdb.ensure_index([('uid', 1), ('qid', 1)])
    print "Closing MDB connection"
    mdb_conn.close()
    print "Done in", time()-t0
    print "Terminating script."
def load_user_queries_ids_and_str_triplets(targets_file):
    queries_ids_to_str_mapping = {}
    queries_users_pairs = []
    with univ_open(targets_file, 'r') as f:
        for line in f:
            qid, uid, qstr = line.strip().split('\t')
            qid, uid = int(qid), int(uid)
            queries_users_pairs.append((qid, uid))
            queries_ids_to_str_mapping[qid] = qstr
    return queries_users_pairs, queries_ids_to_str_mapping
def process(self, **options):
    self.clicks = [0] * 60000000
    allowed_query = options['allowed_query']
    with univ_open(self.log_filepath, mode='r') as f:
        for line in f:
            line_arr = line.strip().split('\t')
            if len(line_arr) == 5:  # This is a clickthrough line
                user_id = int(line_arr[0])
                queryid = int(line_arr[1].strip())
                if queryid != allowed_query:
                    continue
                self.clicks[user_id] += 1
def process(self, **options):
    self.users_number_of_entries = [0] * self.MAX_USER_ID
    with univ_open(self.log_filepath, mode='r') as f:
        i = 0
        for line in f:
            i += 1
            if line.count("\t") != 4:
                # This is not a click log line (it could just be a search log line)
                continue
            try:
                user_id = int(line[:line.index("\t")])
                self.users_number_of_entries[user_id] += 1
            except ValueError as e:
                print "Line number", i, "has invalid user id:", e
def load(g, original_ids_pm, fname):
    print "Loading file..."
    with univ_open(fname, 'r') as f:
        data = json.load(f)
    print "File loaded"
    vertices = [None] * MAX_URL_ID
    n = -1
    n_e = -1
    n0 = n
    n_e0 = n_e
    t0 = time()
    t1 = t0
    # Note: Remember that the graph is stored in JSON, but not under the standard dictionary format {X: edges};
    # it uses tuples instead: [(X, edges), (Y, edges), ...]
    # because lists can easily be write/read-streamed while dictionaries are a little bit more complicated,
    # and dictionaries are also much heavier while in the end we mostly want to iterate through this file anyway
    for node, edges in data:
        if vertices[node] is not None:
            v_node = g.vertex(vertices[node])
        else:
            v_node = g.add_vertex()
            n += 1
            vertices[node] = n
            original_ids_pm[v_node] = node  # Register the original id of the node as a property of the node
        for e in edges:
            if vertices[e] is not None:
                v = g.vertex(vertices[e])
            else:
                v = g.add_vertex()
                n += 1
                vertices[e] = n
                original_ids_pm[v] = e  # Register the actual id of the node as a property of the node
            n_e += 1
            g.add_edge(v_node, v)
        if n % 10000 == 0:
            print "======"
            print "Loaded", n, "nodes in", time()-t0, ". Average:", n/(time()-t0), "nodes/s. Current pace:", (n-n0)/(time()-t1), "n/s"
            print "Loaded", n_e, "edges in", time()-t0, ". Average:", n_e/(time()-t0), "edges/s. Current pace:", (n_e-n_e0)/(time()-t1), "e/s"
            n0 = n
            n_e0 = n_e
            t1 = time()
    print "Loaded", n, "nodes"
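# Illustrative sketch (not from the original module) of the on-disk JSON layout described in the
# comment above: a streamable list of (node_id, [successor_ids]) pairs rather than a dictionary.
def _example_graph_json_layout():
    import json
    demo = json.dumps([[0, [1, 2]], [1, [2]]])  # node 0 -> {1, 2}, node 1 -> {2}
    for node, edges in json.loads(demo):
        assert isinstance(node, int) and isinstance(edges, list)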
def load_big_query_set(log_filepath, allowed_users):
    """
    Loads the set of all queries that have been issued by at least one of the allowed users
    (and thus are part of their profile)
    """
    set_of_queries = set()
    with univ_open(log_filepath) as f:
        for line in f:
            line = line.strip().split('\t')
            n = len(line)
            if n != 5 and n != 3:
                continue
            if int(line[0]) in allowed_users:
                set_of_queries.add(int(line[1]))
    return set_of_queries
def process(self, **options):
    # Note: Although we are not going to use all the indices in this list,
    # we are still using a list in order to have fast direct element access.
    # In order not to waste memory, though, we initialize everything to None
    # and then only initialize the base dictionary for the elements of the list
    # that are going to be used
    self.user_clicks_number = [None] * self.MAX_USER_ID
    try:
        excluded_qids = options['excluded_qids'] if options['excluded_qids'] is not None else set()
    except KeyError:
        excluded_qids = set()
    for _ in options['allowed_user_ids']:
        self.user_clicks_number[_] = {'_id': _}
    with univ_open(self.log_filepath, mode='r') as f:
        i = 0
        for line in f:
            i += 1
            if i % 500000 == 0:
                print "Currently at line", i
            line = line.strip().split('\t')
            if len(line) != 5:
                # This is not a click log line (it could just be a search log line)
                continue
            try:
                user_id = int(line[0])
                queryid = int(line[1].strip())
                if queryid in excluded_qids:
                    continue
                url_domain_id = int(line[4].strip())
                if user_id in options['allowed_user_ids']:
                    self.user_clicks_number[user_id][queryid][url_domain_id] = \
                        1 + self.user_clicks_number[user_id] \
                            .setdefault(queryid, {url_domain_id: 0}) \
                            .setdefault(url_domain_id, 0)
            except ValueError as err:
                print "Line number", i, "has invalid user id:", err
            except KeyError as err:
                print "KeyError: ", err
                print user_id, queryid, url_domain_id
                exit()
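# Illustrative sketch (not part of the original class): the nested setdefault calls above build
# a per-user structure of the form {'_id': uid, queryid: {domainid: click_count}}.
def _example_nested_click_counter():
    user_entry = {'_id': 7}
    queryid, url_domain_id = 42, 3
    for _ in range(2):  # simulate two clicks on the same (query, domain) pair
        user_entry[queryid][url_domain_id] = \
            1 + user_entry.setdefault(queryid, {url_domain_id: 0}).setdefault(url_domain_id, 0)
    assert user_entry == {'_id': 7, 42: {3: 2}}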
def process(self, **options):
    """
    /!\ If using options.serp_urls_uniqueness_ and lazy = False, serp_urls returned will be sorted
    """
    # To-be-copy/pasted example of options handling:
    # try:
    #     self.serp_urls_uniqueness = options['serp_urls_uniqueness']
    # except KeyError:
    #     self.serp_urls_uniqueness = False
    # try:
    #     self.lazy = options['lazy']
    # except KeyError:
    #     self.lazy = True
    # try:
    #     self.return_serp_urls = options['return_serp_urls']
    # except KeyError:
    #     self.return_serp_urls = False
    self.entries = {}
    self.clicks = {}
    with univ_open(self.log_filepath, mode='r') as f:
        for line in f:
            line_arr = line.strip().split('\t')
            if len(line_arr) == 5:  # This is a clickthrough line
                user_id, keywords, date, pos, domain = line_arr
                if keywords not in self.entries:
                    self.entries[keywords] = {}
                try:
                    self.entries[keywords][domain] += 1
                except KeyError:
                    self.entries[keywords][domain] = 1
def process(self, **options):
    # Note: Although we are not going to use all the indices in this list,
    # we are still using a list in order to have fast direct element access.
    # In order not to waste memory, though, we initialize everything to None
    # and then only initialize at 0 the queries that we encounter in the logs
    self.query_clicks_number = [None] * self.MAX_QUERY_ID
    try:
        excluded_qids = options['excluded_qids'] if options['excluded_qids'] is not None else set()
    except KeyError:
        excluded_qids = set()
    with univ_open(self.log_filepath, mode='r') as f:
        i = 0
        for line in f:
            i += 1
            if i % 500000 == 0:
                print "Currently at line", i
            line = line.strip().split('\t')
            try:
                user_id = int(line[0])
                queryid = int(line[1].strip())
                if len(line) == 5:
                    url_domain_id = int(line[4].strip())
                if queryid in excluded_qids:
                    continue
            except (ValueError, IndexError) as err:
                print "Line number", i, "has an invalid user, query or domain id:", err
                continue  # Skip the line, do not fall through with stale values
            if self.query_clicks_number[queryid] is None:
                self.query_clicks_number[queryid] = 0
            if len(line) == 5:  # This is a click log line (as opposed to a search-only log line)
                self.query_clicks_number[queryid] += 1
def main():
    import sys
    from sys import argv
    t_init = time()
    argc = len(sys.argv)
    if argc < (len(CLI_ARGS)+1):
        print "Usage:", argv[0], " ".join(CLI_ARGS), " ".join(OPT_ARGS)
        exit()
    input_file = argv[1]
    output_path = argv[2]
    t0 = time()
    print "Loading input file..."
    n_of_queries = {}
    with univ_open(input_file, 'r') as f:
        for line in f:
            # We multiply by 10^(number of digits to keep) and then take the integer part,
            # so that all values Y.xxxy will be stored under YXXX and thus grouped together
            entropy_value = int(float(line.split(',')[2])*QUANTIZATION_FACTOR)
            try:
                n_of_queries[entropy_value] += 1
            except KeyError:
                n_of_queries[entropy_value] = 1
    print "Done in", time() - t0
    print "Outputting result to", output_path, "..."
    with open(output_path, 'w+') as out:
        out.write("ClickEntropyScore,NOfQueries\n")
        out.write('\n'.join((OUTPUT_FORMAT % (i/float(QUANTIZATION_FACTOR), (i+1)/float(QUANTIZATION_FACTOR), n)
                             for i, n in sorted(n_of_queries.items()))))
    print "File written."
    print "Total: done in", time() - t_init
    print "Terminating script."
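# Illustrative sketch (not part of the original script): with a hypothetical QUANTIZATION_FACTOR
# of 1000, the bucketing above maps entropies 1.2345 and 1.2349 to the same bucket 1234,
# i.e. the half-open interval [1.234, 1.235).
def _example_entropy_quantization():
    quantization_factor = 1000  # hypothetical value, the real one is the module-level QUANTIZATION_FACTOR
    assert int(1.2345 * quantization_factor) == int(1.2349 * quantization_factor) == 1234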
def top_k_queries(k, host, queries_filter_file, allowed_users_file, ids_mapping_file):
    queries_ids = compute_everything(host, queries_filter_file, allowed_users_file, k)['q_list']
    queries_strings_indexed_by_id = [l.strip() for l in univ_open(ids_mapping_file)]
    queries_strings = [queries_strings_indexed_by_id[i] for i in queries_ids]
    return queries_strings
def main():
    ################################################################################
    import sys
    from time import time
    t0 = time()
    n_args = len(sys.argv)
    if n_args < (len(CLI_ARGS)+1):
        print "Usage:", sys.argv[0], " ".join(CLI_ARGS), " ".join(OPT_ARGS)
        exit()
    start_index, end_index = None, None
    if n_args > len(CLI_ARGS)+1:
        start_index = int(sys.argv[len(CLI_ARGS)+1].strip())
    if n_args > len(CLI_ARGS)+2:
        end_index = int(sys.argv[len(CLI_ARGS)+2])
    # Run that immediately so that we crash on the spot if we cannot connect to the DB anyway
    from pymongo import MongoClient
    mdb_host = sys.argv[3].strip()
    mdb_conn = MongoClient(host=mdb_host)
    mdb = mdb_conn.queries.clustering
    clusters_path = sys.argv[1].strip()
    queries_id_mapping_filepath = sys.argv[2].strip()
    ################################################################################
    cp = ClustersProcessor(clusters_path)
    t0 = time()
    cp.process()
    print "Loaded", len(cp.clusters), "clusters."
    ################################################################################
    if start_index is None and end_index is None:
        print "Dropping previous DB"
        try:
            mdb.drop()
            mdb = mdb_conn.queries.clustering
        except Exception as err:
            print type(err), err
    else:
        if start_index is None:
            # Remove everything up to the end_index
            where_clause = {'_id': {'$lte': end_index}}
        elif end_index is None:
            # Remove everything starting at start_index
            where_clause = {'_id': {'$gte': start_index}}
        else:
            # Remove everything between the bounds
            where_clause = {'$and': [{'_id': {'$gte': start_index}}, {'_id': {'$lte': end_index}}]}
        print "Removing documents with following where_clause=", where_clause
        mdb.remove(where_clause)
    ################################################################################
    t0 = time()
    # Note: 40k seems to be the limit, more than that and MongoDB will say "query is too large"
    batch_size = 40000
    start = 0
    end = 0
    with univ_open(queries_id_mapping_filepath, mode='r') as f:
        queries_vectors = []
        i = 0
        for line in f:
            # Skip everything up to the starting index
            if start_index is not None and i < start_index:
                i += 1
                continue
            # Hop, we reached the end index, break there
            if end_index is not None and i > end_index:
                break
            queries_vectors.append(cp.cluster_vector_for_kw(line))
            i += 1
            if i % batch_size == 0:
                start = i - batch_size
                end = i
                sys.stdout.write("Committing batch %d %d...\t" % (start, end))
                sys.stdout.flush()
                insert_with_ids_range(mdb, queries_vectors, start, end)
                print "done."
                queries_vectors = []  # GC?
        # end for line in f
    # If len(queries_vectors) was not a multiple of batch_size, let us execute the last batch.
    # Note that the variable "end" still holds the last value that was assigned to it:
    # either 0 if the batch size is greater than the total size of the data,
    # or the end index of the last batch that was committed.
    if queries_vectors:
        print "Last batch..."
        insert_with_ids_range(mdb, queries_vectors, end, None)
    print "Committed", i, "vectors in", time()-t0
    ################################################################################
    print "No index creations needed"
    ################################################################################
    print "Closing MDB connection"
    mdb_conn.close()
    ################################################################################
    print "Done in", time()-t0
    print "Terminating script."
def compute_everything(host, queries_filter_file, allowed_users_file, top_n=N_QUERIES, null_cluster_norm_threshold=ZERO_FLOAT):
    """
    This function takes as input a list of allowed queries and users, and outputs the users and queries
    after filtering out the ones (users and queries) that have a null clustering vector,
    plus some other useful information.

    Inputs: see the CLI arguments corresponding to most of the arguments of this method.
        top_n is the maximum number of queries that we keep, after having pruned the ones with null clustering vectors
        null_cluster_norm_threshold is the floating point threshold under which we consider a clustering vector to be null
    Outputs:
        users: the set of users that both queried at least once a query of queries_filter_file and are in the
            allowed_users_file set of users
        sums_of_clusters: the sum of all clusters of queries the user queried
        q_list: the list of queries that we kept, from queries_filter_file and by removing queries with null
            clustering vectors
        clusters: the clusters that we loaded, for the given queries
        removed_queries: queries with null clustering vector that we removed
    """
    from univ_open import univ_open
    from numpy.linalg import norm
    from pymongo import MongoClient
    print "Connecting to MongoDB..."
    mdb_conn = MongoClient(host=host)
    init_mdb(mdb_conn)
    us.init_mdb(mdb_conn)
    print "Parsing filter files..."
    print "Queries list..."
    q_list = [int(line.strip()) for line in univ_open(queries_filter_file, mode='r')]
    print "Loaded", len(q_list), "seed queries."
    print "Retrieving queries clusters..."
    # Retrieve all the clusters of the queries
    clusters = us.clusters(q_list)
    # Check for null vectors... just in case...
    print "Checking for null clusters vectors..."
    removed_queries = []
    for qid, cl in clusters.items():
        if norm(cl) < null_cluster_norm_threshold:  # Should be precise enough?
            print "Warning, query", qid, "has a null clustering vector"
            print "Removing it"
            del clusters[qid]
            removed_queries.append(qid)
    print "Previously had", len(q_list), "queries. Now have", len(clusters), "queries"
    print "Taking the top", top_n, "out of them"
    q_list2 = []
    for q in q_list:
        if q in clusters:
            q_list2.append(q)
        if len(q_list2) >= top_n:
            print "Reached the", top_n, "queries"
            break
    q_list = q_list2
    print "Allowed users list..."
    allowed_users = set([int(line.strip()) for line in univ_open(allowed_users_file, mode='r')])
    print "Parsed user file, processing it..."
    users_queries = {}
    for item in users_who_queried(q_list):
        uid = item['uid']
        if uid in allowed_users:
            try:
                users_queries[uid].append(item['qid'])
            except KeyError:
                users_queries[uid] = [item['qid']]
    # Now, we are going to generate the list of pairs to compute similarity against, by only keeping pairs of users
    # that have at least one cluster in common, that is to say, that issued at least one query that has a cluster in
    # common with at least one query of the other user...
    users = users_queries.keys()
    n_users = len(users)
    # We compute all of them at once instead of doing it in the loop because if we did it in the loop we would end up
    # computing them multiple times (i times for the ith item of the 'users' list)
    print "Computing sums of clusters vectors per user... (", n_users, "users)"
    sums_of_clusters = []
    for u in users:
        try:
            for qid in users_queries[u]:
                try:
                    sums_of_clusters.append(sum(clusters[qid]))
                except KeyError as err:
                    print err
                    print "The qid=", qid, "was not found?!?"
                    exit()
        except KeyError as err:
            print err
            print "The user=", u, "was not found?!?"
            exit()
    return {
        'users': users,
        'sums_of_clusters': sums_of_clusters,
        'q_list': q_list,
        'clusters': clusters,
        'removed_queries': removed_queries,
    }
def process(self, **options):
    # Note: Although we are not going to use all the indices in this list,
    # we are still using a list in order to have fast direct element access.
    # In order not to waste memory, though, we initialize everything to None
    # and then only initialize the base dictionary for the elements of the list
    # that are going to be used
    self.user_queries_number = [None] * self.MAX_USER_ID
    for _ in options['allowed_user_ids']:
        self.user_queries_number[_] = {'_id': _}
    try:
        excluded_qids = options['excluded_qids'] if options['excluded_qids'] is not None else set()
    except KeyError:
        excluded_qids = set()
    last_queryid = -1
    with univ_open(self.log_filepath, mode='r') as f:
        i = 0
        for line in f:
            i += 1
            if i % 500000 == 0:
                print "Currently at line", i
            line = line.strip().split('\t')
            if len(line) != 5 and len(line) != 3:
                # This is neither a click log line nor a search log line.
                # Note that if there are no clicks, a "search log line" will appear in the logs,
                # but if there were some clicks, this line will not appear and a "clickthrough log line" (or
                # several ones) will appear instead. As a consequence, we cannot simply count the number of
                # "search" type lines in order to know how many times the user issued the query;
                # we also have to consider the click lines
                continue
            try:
                user_id = int(line[0])
                if user_id in options['allowed_user_ids']:
                    queryid = int(line[1].strip())
                    if queryid in excluded_qids:
                        continue
                    curr_time = mktime(datetime.strptime(line[2].strip(), '%Y-%m-%d %H:%M:%S').timetuple())
                    # When was the last time that we saw a log line about this query?
                    # Note: clickthrough log lines for the same query are contiguous
                    last_time = self.user_queries_number[user_id] \
                        .setdefault(queryid, (0, float('-inf')))[1]
                    # Has there been enough time since this last time for us to consider that the user
                    # is really re-issuing the query and not just getting back to the SERP and clicking a
                    # new link?
                    # We will also consider that if the user issued another query in between, and then went
                    # back to this query, it means he did re-issue this query.
                    # Note that as the click log lines seem to be grouped by user, we just compare to the last
                    # query id without having a user-specific last query id
                    if abs(curr_time - last_time) >= SAME_QUERY_TIME_INTERVAL or last_queryid != queryid:
                        self.user_queries_number[user_id][queryid] = (
                            1 + self.user_queries_number[user_id][queryid][0],
                            curr_time
                        )
                    else:
                        # We still update the last time this user issued this query, in order to avoid
                        # someone coming back to the page every 5 seconds, 20 times, being detected as
                        # re-issuing the query just because the last time we registered was only the first
                        # time this user landed on the page
                        self.user_queries_number[user_id][queryid] = (
                            # Please note here there is no +1, of course
                            self.user_queries_number[user_id][queryid][0],
                            curr_time
                        )
                    # And in any case, we also update the last query that we saw... of course
                    last_queryid = queryid
            except ValueError as err:
                print "Line number", i, "has invalid user id:", err
            except KeyError as err:
                print "KeyError: ", err
                print user_id, queryid
                exit()
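# Illustrative sketch (not part of the original class) of the re-issue detection rule used above:
# a new hit on the same query counts as a re-issue only if more than SAME_QUERY_TIME_INTERVAL
# seconds elapsed, or if another query was issued in between. The interval value below is hypothetical.
def _example_reissue_detection():
    same_query_time_interval = 300  # seconds; hypothetical stand-in for SAME_QUERY_TIME_INTERVAL
    count, last_time, last_queryid = 0, float('-inf'), -1
    hits = [(42, 0.0), (42, 10.0), (7, 20.0), (42, 30.0)]  # (queryid, timestamp) log entries
    for queryid, t in hits:
        if queryid == 42:
            if abs(t - last_time) >= same_query_time_interval or last_queryid != queryid:
                count += 1
            last_time = t
        last_queryid = queryid
    assert count == 2  # the first hit and the one after query 7 count; the click 10s later does not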
def main(stop_after_init=False):
    from sys import argv
    argc = len(argv)
    if argc <= len(CLI_ARGS):
        print 'Usage: %s %s %s' % (argv[0], ' '.join(CLI_ARGS), ' '.join(["[%s]" % x for x in OPTIONAL_ARGS]))
        print 'Currently missing parameters arguments:', ' '.join(CLI_ARGS[len(argv)-1:])
        exit()
    serp_result_file = argv[1].strip()
    url_web_crawl_ids_mapping = argv[2].strip()
    output_path = argv[-1].strip()
    t_init = time()
    print "Loading SERP..."
    with univ_open(serp_result_file, 'r') as f:
        serps = jload(f)
    print "Loaded"
    print "Loading urls-to-ids dict..."
    urls_to_ids = {}
    with univ_open(url_web_crawl_ids_mapping, 'r') as f:
        i = 0
        for line in f:
            line = line.strip().lower().replace("%0a", '')
            urls_to_ids[line] = i
            i += 1
    print "Loaded"
    print "Converting SERP..."
    t0 = time()
    not_converted = set()
    total_urls = set()
    converted_set = set()
    for query_serps in serps.values():
        for serp in query_serps:
            i = 0
            while i < len(serp['results']):
                pos, url = serp['results'][i]
                url = url.lower().replace('%0a', '')
                total_urls.add(url)
                try:
                    serp['results'][i] = (pos, urls_to_ids[url])
                    converted_set.add(url)
                except KeyError:
                    # Looks like this URL has not been seen during the web crawl, as it has no assigned ID
                    not_converted.add(url)
                    serp['results'].pop(i)
                    i -= 1
                i += 1
    print "Over", len(total_urls), "total different URLs from the SERP results,", len(not_converted), "could not be converted"
    if len(total_urls) - len(not_converted) < 600:
        print converted_set
    print "Done in", time()-t0
    print "Writing URLs to output file", output_path, "..."
    t0 = time()
    with univ_open(output_path, 'w+') as out:
        jdump(serps, out)
    print "Done in", time()-t0
    print "Script executed in", time() - t_init, "seconds"
def main(argv, no_compute=False, store_results=True):
    import sys
    from time import time
    argc = len(argv)
    if argc <= len(CLI_ARGS):
        print "Usage: %s" % argv[0], ' '.join(CLI_ARGS)
        print "Currently missing parameters arguments:", ' '.join(CLI_ARGS[len(argv)-1:])
        exit()
    pairs_path = argv[1].strip()
    mdb_host = argv[2].strip()
    t_init = time()
    with univ_open(pairs_path, 'r') as f:
        pairs = [tuple([int(_) for _ in l.strip().split(" ")]) for l in f]
    targeted_users = [p[1] for p in pairs]
    sim_users = load_sim_users(mdb_host, targeted_users)
    all_allowed_users = load_set_of_similar_users(mdb_host, targeted_users) | set(targeted_users)
    # This will load everything needed to compute the sim() function in RAM
    similar_users(argv[1:], True, preset_set_of_users=all_allowed_users)
    print "We loaded", len(us.users_clicks_list_indexed_by_uid), "users' clicks information"
    # Init the sim_users array of the user similarity module
    us.init_sim_users(sim_users)
    if no_compute:
        return
    # We're ready to compute!
    for q, u in pairs:
        # Note: The collaborative score is not computed on the user's own clicked pages
        # but using the clicks of the users similar to him,
        # so we have to compute scores for all pages clicked BY HIS SIMILAR USERS
        for sim, u_sim in sim_users[u]:
            try:
                pages = us.users_clicks_list_indexed_by_uid[u_sim][q].keys()
            except KeyError as err:
                key = int(err.message)
                if key == q and key != u_sim:
                    # Well, this similar user just has not queried this query...
                    continue
                print "For u_sim, q", u_sim, q
                print "KeyError with value=", err
                continue
            if pages is None:
                print "?!?! The user", u_sim, "user similar to", u, "has no pages information."
            else:
                print "User", u_sim, "user similar to", u, "has", len(pages), "pages informations for query", q
                for page in pages:
                    sys.stdout.write("score(u=%s, q=%s, p=%s)=" % (u, q, page))
                    sys.stdout.flush()
                    sys.stdout.write("%.5e\n" % us.score(q, page, u))
    if store_results:
        print "Storing the computed scores in the DB"
        print "We are going to use the cache of the us module"
        print "The cache currently contains", len(us.scores), "entries"
        t0 = time()
        mdb = MongoClient(host=mdb_host)
        scores_vectors = {}
        for (q, p, u), score in us.scores.items():
            if score == 0.0:
                # Nil score is the same as no score
                continue
            try:
                scores_vectors[(u, q)].append((p, score))
            except KeyError:
                scores_vectors[(u, q)] = [(p, score)]
        print "Precomputation took", time()-t0
        print "We have", len(scores_vectors), "'(u, q) -> score' entries to commit to the DB"
        print "Committing new one..."
        t0 = time()
        scores_to_commit = []
        for (user, q), scores in scores_vectors.items():
            print "Dropping previous information..."
            mdb.users.urls_perso_scores.remove({'uid': user, 'qid': q})
            scores_to_commit.append(
                {
                    'uid': user,
                    'qid': q,
                    'vector': scores
                }
            )
        mdb.users.urls_perso_scores.insert(scores_to_commit)
        print "Done, committing took", time()-t0
        print "Stats:"
        for (user, q), scores in scores_vectors.items():
            print "We have", len(scores), "scores for (q, u)=", (q, user)
    print "Script execution took", time()-t_init
def main(stop_after_init=False):
    from sys import argv
    argc = len(argv)
    if argc <= len(CLI_ARGS):
        print 'Usage: %s %s %s' % (argv[0], ' '.join(CLI_ARGS), ' '.join(["[%s]" % x for x in OPTIONAL_ARGS]))
        print 'Currently missing parameters arguments:', ' '.join(CLI_ARGS[len(argv)-1:])
        exit()
    urls_from_logs_to_ids_file = argv[1].strip()
    web_crawl_urls_to_ids_file = argv[2].strip()
    output_path = argv[-1].strip()
    t_init = time()
    print "Loading domains string -> id mapping..."
    t0 = time()
    domains_string_to_ids = {}
    with univ_open(urls_from_logs_to_ids_file, 'r') as f:
        current_index = 0
        for line in f:
            domains_string_to_ids[line.strip().lower().replace("%0a", '')] = current_index
            current_index += 1
    print "Done in", time()-t0
    print "Counting urls..."
    t0 = time()
    number_of_urls = 0
    with univ_open(web_crawl_urls_to_ids_file, 'r') as f:
        # Note: I thought I'd use len(f.readlines()) but building this huge list in memory takes ages for nothing
        for l in f:
            number_of_urls += 1
    print "Mapping URLs to their domain id...."
    web_crawl_urls_to_domain_ids = [None] * number_of_urls
    with univ_open(web_crawl_urls_to_ids_file, 'r') as f:
        f.readline()  # The 1st line has no info
        current_index = 0
        start_index = 0  # The second line contains no leading comma
        for line in f:
            line = line.strip().lower()
            if line == "]":
                continue
            line = jloads(line[start_index:]).replace("%0a", '')
            start_index = 1
            domain = extract_domain(line)
            try:
                web_crawl_urls_to_domain_ids[current_index] = domains_string_to_ids[domain]
            except KeyError:
                pass  # Well, not found, then keep no value to assign
            current_index += 1
    print "Done in", time()-t0
    print "Writing URLs to output file", output_path, "..."
    t0 = time()
    with univ_open(output_path, 'w+') as out:
        out.write(
            "\n".join(
                "%d" % web_crawl_urls_to_domain_ids[i] if web_crawl_urls_to_domain_ids[i] is not None else ""
                for i in xrange(len(web_crawl_urls_to_domain_ids))
            )
        )
    print "Done in", time()-t0
    print "Script executed in", time() - t_init, "seconds"
log_filepath = sys.argv[1].strip()
url_domain_ids_path = sys.argv[2].strip()
output_path = sys.argv[3].strip()

# If True, the resulting output will be a valid JSON list.
# Otherwise, it will just be a one-line-per-item list, ordered by index (no huge difference!)
JSON_OUTPUT = False

# Size of IO batch writes
BATCH_SIZE = 10000

from time import time

batch_str = ''
items_in_queue = 0
t0 = time()
with univ_open(output_path, 'wb') as outfile:
    print "Loading url_domain->ids mapping... (brace yourself, RAM!)"
    t0 = time()
    with univ_open(url_domain_ids_path, 'rb') as idmapfile:
        url_domain_ids = {}
        i = 0
        for line in idmapfile:
            url_domain_ids[line.strip()] = str(i)
            i += 1
    print "Done in", time()-t0
    print "Starting logs parsing + I/O to output file", output_path
    t0 = time()
    with univ_open(log_filepath, mode='r') as f:
        for line in f:
            line = line.strip().split('\t')
def process(self, **options):
    self.serps_to_logs_mapping = {}
    self.logs_clicked_domain = {}
    try:
        excluded_qids = options['excluded_qids'] if options['excluded_qids'] is not None else set()
    except KeyError:
        excluded_qids = set()
    web_crawl_urls_to_domain_ids = options['web_crawl_urls_to_domain_ids']
    query_str_to_ids = options['query_str_to_ids']
    serps = options['serps']
    try:
        allowed_queries = options['allowed_queries']
    except KeyError:
        allowed_queries = None
        print "Warning, no set of allowed queries"
    with univ_open(self.log_filepath, mode='r') as f:
        i = 0
        for line in f:
            i += 1
            if i % 500000 == 0:
                print "Currently at line", i
            line = line.strip().split('\t')
            if len(line) != 5:
                # This is not a click log line (it could just be a search log line)
                continue
            try:
                user_id = int(line[0])
                queryid = int(line[1].strip())
                position = int(line[3].strip())
                url_domain_id = int(line[4].strip())
            except ValueError as err:
                print "Line number", i, "has an invalid user, query, position or url_domain id:", err
                continue  # Skip the line, we cannot use it anyway
            except KeyError as err:
                print "KeyError: ", err
                print queryid, url_domain_id, user_id
                exit()
            if user_id not in options['allowed_users']:
                continue
            if (allowed_queries is not None and queryid not in allowed_queries) or queryid in excluded_qids:
                continue
            self.logs_clicked_domain.setdefault(queryid, {}).setdefault(position, []).append(url_domain_id)
    print "Result of clicks (pos, domain) gathering:"
    print self.logs_clicked_domain
    clicks = list()
    urls_serps = list()
    mapped = list()
    for queryid, click_entries in self.logs_clicked_domain.items():
        query_str = query_str_to_ids[queryid]
        if query_str not in serps:
            print "[INFO] Query", query_str, "was not in the serps results"
            continue
        for position, domain_ids in sorted(click_entries.items()):
            for domain_id in domain_ids:
                clicks.append(domain_id)
                for serp in serps[query_str]:
                    i = 0
                    while i < len(serp['results']):
                        pos, urlid = serp['results'][i]
                        urls_serps.append(urlid)
                        urlid_domain_id = web_crawl_urls_to_domain_ids[urlid]
                        if domain_id == urlid_domain_id:
                            mapped.append((queryid, urlid))
                            if (queryid, urlid) in self.serps_to_logs_mapping:
                                print (queryid, urlid), "was already mapped to", self.serps_to_logs_mapping[(queryid, urlid)]
                                print "Remapping it to", domain_id
                            self.serps_to_logs_mapping[(queryid, urlid)] = domain_id
                            serp['results'].pop(i)  # Popping it so that we do not insert it twice
                            i -= 1
                        i += 1
    print "We went through", len(set(clicks)), "different domain ids (", \
        len(clicks), "total (domain, pos) entries)", "from the logs,", \
        len(set(urls_serps)), "different urlids (", len(urls_serps), "(pos,urlids) entries in total)", \
        "from the SERPs and we mapped", len(mapped), "URLs,", len(set(mapped)), "different ones"
def main():
    # TODO:
    # load the requeried SERPs and store them as follows:
    # (query_id, [complete_list_of_urls_from_serps_in_order_of_appearance])
    # then when we go through the users' clicks, we do:
    # index = urls_from_serps[queryid].find(current_domain, lambda: extract_domain())
    # urls_from_serps.pop(index)
    # somehow get the info about the domain ID in the logs, make the mapping, etc. ...
    import sys
    from sys import argv
    t_init = time()
    argc = len(sys.argv)
    if argc < (len(CLI_ARGS)+1):
        print "Usage:", argv[0], " ".join(CLI_ARGS), " ".join(OPT_ARGS)
        exit()
    logfile = argv[1]
    mdb_host = argv[2]
    queries_users_file = argv[3]
    query_str_to_ids_mapping_file = argv[4]
    serp_requery_result_file = argv[5]
    web_crawl_graph_url_to_domain_ids_mapping_file = argv[6]
    output_path = argv[-1]
    print "Loading target users..."
    queries_users_pairs = [tuple(map(int, _.strip().split(' '))) for _ in univ_open(queries_users_file, 'r')]
    targeted_users_set = set([_[1] for _ in queries_users_pairs])
    targeted_queries_set = set([_[0] for _ in queries_users_pairs])
    print "Loaded", len(targeted_users_set), "target users."
    print "Loading the set of their top similar users..."
    set_of_similar_users = load_set_of_similar_users(mdb_host, targeted_users_set)
    print "Loaded a total of", len(set_of_similar_users), "new allowed users"
    allowed_users = set_of_similar_users | targeted_users_set
    print len(allowed_users), "users allowed in total"
    print "Loading SERP file..."
    t0 = time()
    serps = jload(univ_open(serp_requery_result_file, 'r'))
    print "Done in", time() - t0
    print "Loading IDs <-> query strings..."
    t0 = time()
    queries_str_indexed_by_ids = [_.strip() for _ in univ_open(query_str_to_ids_mapping_file, 'r')]
    print "Done in", time() - t0
    print "Loading (web crawl) URL IDs <-> (logs) domain ids..."
    t0 = time()
    web_crawl_urls_to_domain_ids = [int(_.strip()) if _ != '\n' else None
                                    for _ in univ_open(web_crawl_graph_url_to_domain_ids_mapping_file, 'r')]
    print "Done in", time() - t0
    lp = LogProcessor(logfile)
    t0 = time()
    print "Starting process..."
    lp.process(
        serps=serps,
        allowed_queries=targeted_queries_set,
        allowed_users=allowed_users,
        query_str_to_ids=queries_str_indexed_by_ids,
        web_crawl_urls_to_domain_ids=web_crawl_urls_to_domain_ids
    )
    print "Done in", time() - t0
    print "Outputting result to", output_path, "..."
    with open(output_path, 'w+') as out:
        # q, urlid, domainid
        # meaning (q, urlid) -> domainid
        out.write('\n'.join(("%d,%d,%d" % (item[0][0], item[0][1], item[1])
                             for item in lp.serps_to_logs_mapping.items())))
    print "Done in", time() - t_init
    print "Terminating script."
clusters = load_clusters_for_queries(
    qs,
    clusters_path,
    queries_id_mapping_filepath
)
print "Computing the queries that we can remove..."
removed_queries = compute_removed_queries_because_of_null_clustering('/tmp/blorg', clusters)
print "We can remove", len(removed_queries), "from this set"
final_set = qs - removed_queries
print "That makes the total set of size:", len(final_set)
# Now get the real query strings:
print "Converting from queryids back to query strings..."
query_strings_set = set()
with univ_open(queries_id_mapping_filepath, mode='r') as f:
    queryid = 0
    for l in f:
        if queryid in final_set:
            query_strings_set.add(l.strip().lower())
        queryid += 1  # Line numbers are the ids of the queries
output_path = pjoin(HOME, 'cppr/data/serp_crawl_queries_to_be_reissued_similar_user_profiles_queries.lst')
print "Outputting result to", output_path, "..."
with univ_open(output_path, mode='w+') as out:
    out.write('\n'.join(query_strings_set))
print "Done"
def load(path):
    with univ_open(path, 'r') as f:
        return dict(
            map(int, l.strip().split(' ')) for l in f
        )
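# Illustrative sketch (not from the original module): load() expects one space-separated integer
# pair per line, e.g. "42 7", and returns a dictionary such as {42: 7, ...}.
def _example_load_pairs_format():
    demo_lines = ["42 7\n", "13 2\n"]  # hypothetical file contents
    assert dict(map(int, l.strip().split(' ')) for l in demo_lines) == {42: 7, 13: 2}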
def pickle_open_and_write(pickle_path, data, dump_f=pdump):
    t0 = time()
    with univ_open(pickle_path, 'wb+') as f:
        dump_f(data, f)
    print "Done ", time()-t0