def main(stop_after_init=False):
    from sys import argv

    argc = len(argv)
    if argc <= len(CLI_ARGS):
        print 'Usage: %s %s %s' % (argv[0], ' '.join(CLI_ARGS), ' '.join(["[%s]" % x for x in OPTIONAL_ARGS]))
        print 'Currently missing arguments:', ' '.join(CLI_ARGS[len(argv)-1:])
        exit()

    files = []
    for x in xrange(1, argc-1):
        files.append(argv[x].strip())

    output_path         = argv[-1].strip()

    t_init = time()
    t0 = time()
    print "Loading result files..."
    serps_combined = {}
    for crawl_result_file in files:
        print "Loading", crawl_result_file, "..."
        t1 = time()
        with univ_open(crawl_result_file, 'r') as f:
            merge_serps(serps_combined, jload(f))
        print "Done in", time()-t1
    print "All files done in", time()-t0

    print "Writing URLs to output file", output_path, "..."
    t0 = time()
    with univ_open(output_path, 'w+') as out:
        jdump(serps_combined, out)
    print "Done in", time()-t0

    print "Script executed in", time() - t_init, "seconds"
def main(stop_after_init=False):
    from sys import argv

    argc = len(argv)
    if argc <= len(CLI_ARGS):
        print 'Usage: %s' % argv[0], ' '.join(CLI_ARGS)
        print 'Currently missing arguments:', ' '.join(CLI_ARGS[len(argv)-1:])
        exit()

    file_to_convert = argv[1].strip()
    mapping_file    = argv[2].strip()
    output_path     = argv[3].strip()

    t_init = time()
    query_strings_to_ids = {}
    with univ_open(mapping_file, 'r') as f:
        i = 0
        for l in f:
            query_strings_to_ids[l.strip().lower()] = i
            i += 1

    with univ_open(output_path, 'w+') as out:
        with univ_open(file_to_convert, 'r') as f:
            out.write('\n'.join([str(query_strings_to_ids[l.strip().lower()]) for l in f]))

    print "Script executed in", time() - t_init, "seconds"
def main(stop_after_init=False):
    from sys import argv

    argc = len(argv)
    if argc <= len(CLI_ARGS):
        print 'Usage: %s %s %s' % (argv[0], ' '.join(CLI_ARGS), ' '.join(["[%s]" % x for x in OPTIONAL_ARGS]))
        print 'Currently missing arguments:', ' '.join(CLI_ARGS[len(argv)-1:])
        exit()

    crawl_result_file   = argv[1].strip()
    output_path         = argv[2].strip()
    split_into_n        = 1

    if argc > len(CLI_ARGS) + 1:
        split_into_n = int(argv[len(CLI_ARGS) + 1])

    t_init = time()
    t0 = time()
    print "Loading result file..."
    with univ_open(crawl_result_file, 'r') as f:
        serps = jload(f)
    print "Done in", time()-t0

    t0 = time()
    print "Writing URLs to output file", output_path, "..."
    # The set() is there because we do not need the same URL multiple times
    # in the seed, and duplicates are actually quite likely to come from the SERPs;
    # on an example run, adding the set() brought us from 4070 URLs down to 2387.
    result = list(set([
        url
        for query_serps in serps.values()
        for serp in query_serps
        for pos, url in serp['results']
    ]))
    urls_n = len(result)
    batch_size = urls_n/split_into_n
    i = 0
    # print urls_n, batch_size, split_into_n, (urls_n/split_into_n)*split_into_n
    for start in range(0, urls_n, batch_size):
        # print start
        i += 1
        dirname, fname = os.path.split(output_path)
        outp = os.path.join(dirname, "%03d_%s" % (i, fname))
        print "Dumping into", outp
        with univ_open(outp, 'w+') as out:
            out.write('\n'.join(result[start:start+batch_size]))
    print "Done in", time()-t0

    print "Script executed in", time() - t_init, "seconds"
def load_mapping(filepath):
    mapping = {}
    with univ_open(filepath, 'r') as f:
        for line in f:
            queryid, urlid, domainid = map(int, line.strip().split(','))
            mapping[(queryid, urlid)] = domainid
    return mapping
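# A minimal usage sketch for load_mapping (the file name below is hypothetical). Each input line is
# expected to hold a "queryid,urlid,domainid" triple, e.g. a line "12,345,678" yields
# mapping[(12, 345)] == 678.
#
#     mapping = load_mapping('serps_to_logs_mapping.csv')
#     domain_id = mapping.get((12, 345))  # None if this (query, url) pair was never mapped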
    def process(self, **options):
        try:
            queries_filter = options["queries_filter"]
        except KeyError:
            queries_filter = None

        self.items_ranks = [0] * (self.MAX_ITEM_RANK + 1)
        with univ_open(self.log_filepath, mode="r") as f:
            for line in f:
                line_arr = line.strip().split("\t")
                if len(line_arr) == 5:  # This is a clickthrough line
                    user_id, keywords, date, item_rank, domain = line_arr
                    try:
                        item_rank = int(item_rank)
                    except ValueError as err:
                        print err
                        print "The line that caused this error is the following:"
                        print line
                        continue  # Skip this value as we cannot parse it anyway

                    if queries_filter is not None and keywords not in queries_filter:
                        # Skip this line, not in the filter
                        continue
                    try:
                        self.items_ranks[item_rank] += 1
                    except IndexError:
                        print "Wow! We got an item ranked", item_rank
                        print "Please increase the MAX_ITEM_RANK value"
                        exit()
        avg = sum([self.items_ranks[i] * i for i in xrange(1, len(self.items_ranks))]) / float(
            sum(self.items_ranks[1:])
        )
        return avg
def load_clusters_for_queries(q_set, clusters_path, queries_id_mapping_filepath, pre_initialized_dict=None, cp=None):
    """
        Loads all the clustering vectors of the queries in the q_set parameter.
        :param q_set the set of queries to return the clustering vectors of
        :param queries_id_mapping_filepath The path to the file providing a mapping from query string to query id
        :param clusters_path the path to the clusters file
        :param pre_initialized_dict if passed as a parameter, this dictionary will be used directly instead of
            instantiating a new one; this can speed up computation if the dictionary already has keys initialized.
        :param {ClustersProcessor} cp : A pre-initialized ClusterProcessor to be used instead of initializing our own one
        :return a dictionary {query_id (int) => clustering vector (numpy.array)}
    """
    if cp is None:
        cp = ClustersProcessor(clusters_path)
    t0 = time()
    cp.process()
    # Avoid the shared-mutable-default pitfall: only create a fresh dict if none was passed in
    clusters = pre_initialized_dict if pre_initialized_dict is not None else {}
    with univ_open(queries_id_mapping_filepath, mode='r') as f:
        query_id = -1  # will represent both the query_id and the index in the list
        for line in f:
            query_id += 1
            if query_id % 200000 == 0:
                print "Currently at query_id=", query_id
            if query_id not in q_set:
                continue
            clusters[query_id] = array(cp.cluster_vector_for_kw(line))
    return clusters
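# A minimal usage sketch (paths and query ids below are hypothetical):
#
#     clusters = load_clusters_for_queries(
#         set([12, 345]),
#         'clusters.txt',
#         'queries_id_mapping.lst',
#     )
#     # clusters[12] is then the numpy.array clustering vector of query id 12, where query ids
#     # are the 0-based line numbers of the queries_id_mapping file.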
    def process(self, **options):
        self.clusters = []
        with univ_open(self.filepath, mode='r') as f:
            i = 0
            for line in f:
                i += 1
                if i % 10 == 0:
                    print "Currently at cluster\t", i
                # A cluster line has the following form:
                # Cluster0: longhorn=02404432-n#174#20 bullock=02403820-n#120#20 angus=02405929-n#41#20 
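                # For instance, parsing the example line above should yield the keyword set
                # set(['longhorn', 'bullock', 'angus']) once the "clusterX:" token is dropped and
                # each entry is split on '='.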
                line = line.strip().split(' ')
                line.pop(0)  # Get rid of the "clusterX:"

                try:
                    self.clusters.append(
                        set([
                            kw.strip().split("=")[0].strip().lower()
                            for kw in line
                        ])
                    )
                except IndexError as err:
                    print err
                    print "Cluster number", i, "IndexError was raised, cluster list is:"
                    print line
        # Compute the overall set of keywords in the clusters:
        self.set_of_kw = set.union(*self.clusters)
def main():
    import sys
    t0 = time()

    if len(sys.argv) < (len(CLI_ARGS)+1):
        print "Usage:", sys.argv[0], " ".join(CLI_ARGS), " ".join(OPT_ARGS)
        exit()

    # Run that immediately so that we crash on the spot if we cannot connect to the DB anyway
    from pymongo import MongoClient
    mdb_host = sys.argv[3].strip()
    mdb_conn = MongoClient(host=mdb_host)
    mdb = mdb_conn.users.clicks

    allowed_user_filepath   = sys.argv[1].strip()
    log_filepath            = sys.argv[2].strip()

    allowed_user_ids = set()
    with univ_open(allowed_user_filepath, 'r') as f:
        for line in f:
            allowed_user_ids.add(int(line.strip()))

    print "Loaded", len(allowed_user_ids), "allowed user ids."
    
    lp = process_logfile(log_filepath, allowed_user_ids)

    # Note that we drop the former data only now so that if something goes wrong during processing at least you still
    # have the former data
    print "Dropping previous DB"
    try:
        mdb.drop()
        mdb = mdb_conn.users.clicks
    except Exception as err:
        print type(err), err
    
    print "Dumping everything into MongoDB"
    t0 = time()
    batch_size = 200000
    end = 0
    for i in xrange(batch_size, len(lp.user_clicks_number), batch_size):
        start = i-batch_size
        end = i
        print "Batch", start, end
        insert_sublist(mdb, lp.user_clicks_number[start:end])

    # If len(lp.user_clicks_number) was not a multiple of batch_size, execute the last batch:
    if end != len(lp.user_clicks_number):
        print "Last batch..."
        insert_sublist(mdb, lp.user_clicks_number[end:])

    print "Creating indexes..."
    mdb.ensure_index([('uid', 1)])
    mdb.ensure_index([('qid', 1)])
    mdb.ensure_index([('uid', 1), ('qid', 1)])

    print "Closing MDB connection"
    mdb_conn.close()
    print "Done in", time()-t0
    print "Terminating script."
def load_user_queries_ids_and_str_triplets(targets_file):
    queries_ids_to_str_mapping = {}
    queries_users_pairs = []
    with univ_open(targets_file, 'r') as f:
        for line in f:
            qid, uid, qstr = line.strip().split('\t')
            qid, uid = int(qid), int(uid)
            queries_users_pairs.append((qid, uid))
            queries_ids_to_str_mapping[qid] = qstr

    return queries_users_pairs, queries_ids_to_str_mapping
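# A minimal usage sketch (the file name is hypothetical). The targets file is expected to hold
# tab-separated "qid<TAB>uid<TAB>query string" lines, e.g. a line "12\t345\tcheap flights" yields
# the pair (12, 345) and the mapping entry {12: 'cheap flights'}.
#
#     pairs, qid_to_str = load_user_queries_ids_and_str_triplets('queries_users_triplets.tsv')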
    def process(self, **options):
        self.clicks = [0] * 60000000
        allowed_query = options['allowed_query']
        with univ_open(self.log_filepath, mode='r') as f:
            for line in f:
                line_arr = line.strip().split('\t')
                if len(line_arr) == 5: # This is a clickthrough line
                    user_id = int(line_arr[0])
                    queryid = int(line_arr[1].strip())
                    if queryid != allowed_query:
                        continue
                    self.clicks[user_id] += 1
    def process(self, **options):
        self.users_number_of_entries = [0] * self.MAX_USER_ID
        with univ_open(self.log_filepath, mode='r') as f:
            i = 0
            for line in f:
                i += 1
                if line.count("\t") != 4:  # This is not a click log line (it could just be a search log line)
                    continue
                try:
                    user_id = int(line[:line.index("\t")])
                    self.users_number_of_entries[user_id] += 1
                except ValueError as e:
                    print "Line number", i, "has invalid user id:", e
def load(g, original_ids_pm, fname):
    print "Loading file..."
    with univ_open(fname, 'r') as f:
        data = json.load(f)
    print "File loaded"
    vertices = [None] * MAX_URL_ID
    n = -1
    n_e = -1
    n0 = n
    n_e0 = n_e
    # comma_offset = 0
    t0 = time()
    t1 = t0
    # Note: Remember that the graph is stored in JSON but not under the standard dictionary-format {X: edges}
    # but using tuples: [(X, edges), (Y, edges), ...]
    # because lists can easily be write/read-streamed while dictionaries are a little bit more complicated,
    # and it is also much heavier to use dictionaries when in the end we mostly want to iterate through this file anyway
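    # For example, a stored graph could look like [[12, [34, 56]], [34, [12]], [56, []]]
    # (node ids here are purely illustrative) instead of {"12": [34, 56], ...}.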
    for node, edges in data:
        if vertices[node] is not None:
            v_node = g.vertex(vertices[node])
            # log("Node", node, "already exists")
        else:
            # log("Creating node for", node)
            v_node = g.add_vertex()
            n += 1
            vertices[node] = n
            original_ids_pm[v_node] = node  # Register the original id of the node as a property of the node
            
        for e in edges:
            v = None
            if vertices[e] is not None:
                v = g.vertex(vertices[e])
                # log("Node", e, "already exists")
            else:
                # log("Creating node for", e, "(", type(e), ") to create the corresponding edge")
                v = g.add_vertex()
                n += 1
                vertices[e] = n
                original_ids_pm[v] = e  # Register the actual id of the node as a property of the node
            n_e += 1
            g.add_edge(v_node, v)

        if n % 10000 == 0:
            print "======"
            print "Loaded", n, "nodes in", time()-t0, ". Average:", n/(time()-t0), "nodes/s. Current pace:", (n-n0)/(time()-t1), "n/s"
            print "Loaded", n_e, "edges in", time()-t0, ". Average:", n_e/(time()-t0), "edges/s. Current pace:", (n_e-n_e0)/(time()-t1), "e/s"
            n0 = n
            n_e0 = n_e
            t1 = time()
    print "Loaded ", n, "nodes"
def load_big_query_set(log_filepath, allowed_users):
    """
        Loads the set of all queries that have been issued by at least one of the allowed users 
        (and thus are part of their profile)
    """
    set_of_queries = set()
    with univ_open(log_filepath) as f:
        for line in f:
            line = line.strip().split('\t')
            n = len(line)
            if n != 5 and n != 3:
                continue
            if int(line[0]) in allowed_users:
                set_of_queries.add(int(line[1]))
    return set_of_queries
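# A minimal usage sketch (the log path and user ids below are hypothetical):
#
#     allowed = set([123, 456])
#     profile_queries = load_big_query_set('click_logs.tsv', allowed)
#     # profile_queries is the set of query ids issued at least once by any of the allowed users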
    def process(self, **options):
        # Note: Although we are not going to use all the indices in this list,
        # we are still using a list in order to have fast direct element access.
        # In order not to waste memory, though, we initialize everything to None
        # and then only initialize the base dictionary for the elements of the list
        # that are going to be used
        self.user_clicks_number = [None] * self.MAX_USER_ID
        try:
            excluded_qids = options['excluded_qids'] if options['excluded_qids'] is not None else set()
        except KeyError:
            excluded_qids = set()

        for _ in options['allowed_user_ids']:
            self.user_clicks_number[_] = {'_id': _}

        with univ_open(self.log_filepath, mode='r') as f:
            i = 0
            for line in f:
                i += 1
                if i % 500000 == 0:
                    print "Currently at line", i
                line = line.strip().split('\t')
                if len(line) != 5:  # This is not a click log line (it could just be a search log line)
                    continue
                try:
                    user_id = int(line[0])
                    queryid = int(line[1].strip())
                    if queryid in excluded_qids:
                        continue
                    url_domain_id = int(line[4].strip())
                    
                    if user_id in options['allowed_user_ids']:
                        self.user_clicks_number[user_id][queryid][url_domain_id] = \
                                1 + self.user_clicks_number[user_id] \
                                .setdefault(queryid, {url_domain_id: 0}) \
                                .setdefault(url_domain_id, 0)
                except ValueError as err:
                    print "Line number", i, "has invalid user id:", err
                except KeyError as err:
                    print "KeyError: ", err
                    print user_id, queryid, url_domain_id
                    exit()
    def process(self, **options):
        """

        /!\ If using options['serp_urls_uniqueness'] and lazy = False, the serp_urls returned will be sorted

        """
        # To be c/p-ed example of options
        # try:
        #     self.serp_urls_uniqueness = options['serp_urls_uniqueness']
        # except KeyError:
        #     self.serp_urls_uniqueness = False
        # try:
        #     self.lazy = options['lazy']
        # except KeyError:
        #     self.lazy = True
        # try:
        #     self.return_serp_urls = options['return_serp_urls']
        # except KeyError:
        #     self.return_serp_urls = False
        
        self.entries = {}
        self.clicks = {}
        with univ_open(self.log_filepath, mode='r') as f:
            for line in f:
                line_arr = line.strip().split('\t')
                if len(line_arr) == 5: # This is a clickthrough line
                    user_id, keywords, date, pos, domain = line_arr
                    # try:
                    #      domain = domain[domain.index("//")+2:] # Getting rid of something://
                    # except ValueError: # Was not found, domain is specified without protocol?
                    #      pass
                    # entry = (user_id, date, pos, domain)
                    # try:
                    #     self.entries[keywords].append(entry)
                    # except KeyError:
                    #     self.entries[keywords] = [entry]
                    if keywords not in self.entries:
                        self.entries[keywords] = {}
                    try:
                        self.entries[keywords][domain] += 1
                    except KeyError:
                        self.entries[keywords][domain] = 1
    def process(self, **options):
        # Note: Although we are not going to use all the indices in this list,
        # we are still using a list in order to have fast direct element access.
        # In order not to waste memory, though, we initialize everything to None
        # and then only initialize at 0 the queries that we encounter in the logs
        self.query_clicks_number = [None] * self.MAX_QUERY_ID
        try:
            excluded_qids = options['excluded_qids'] if options['excluded_qids'] is not None else set()
        except KeyError:
            excluded_qids = set()

        with univ_open(self.log_filepath, mode='r') as f:
            i = 0
            for line in f:
                i += 1
                if i % 500000 == 0:
                    print "Currently at line", i
                line = line.strip().split('\t')
                try:
                    user_id = int(line[0])
                    queryid = int(line[1].strip())
                    # Only clickthrough lines (5 fields) carry a clicked domain id; search lines have fewer fields
                    url_domain_id = int(line[4].strip()) if len(line) == 5 else None
                    if queryid in excluded_qids:
                        continue
                except ValueError as err:
                    print "Line number", i, "has an invalid id field:", err
                    continue  # Skip this line as we cannot parse it anyway
                except KeyError as err:
                    print "KeyError: ", err
                    print user_id, queryid, url_domain_id
                    exit()
                    
                if self.query_clicks_number[queryid] is None:
                    self.query_clicks_number[queryid] = 0

                if len(line) == 5:  # This is a click log line (it could just be a search log line)
                    self.query_clicks_number[queryid] += 1
def main():
    import sys
    from sys import argv
    t_init = time()

    argc = len(sys.argv)

    if argc < (len(CLI_ARGS)+1):
        print "Usage:", argv[0], " ".join(CLI_ARGS), " ".join(OPT_ARGS)
        exit()

    input_file  = argv[1]
    output_path = argv[2]

    t0 = time()
    print "Loading input file..."
    n_of_queries = {}
    with univ_open(input_file, 'r') as f:
        for line in f:
            # We multiply by 10^(number of digits to keep) and then take the integer part
            # so that all values Y.xxxy will be stored under YXXX and thus grouped together
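            # For example, if QUANTIZATION_FACTOR were 1000, an entropy of 1.2345 would be bucketed
            # as int(1.2345 * 1000) = 1234, together with every other 1.234x value.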
            entropy_value = int(float(line.split(',')[2])*QUANTIZATION_FACTOR)
            try:
                n_of_queries[entropy_value] += 1
            except KeyError:
                n_of_queries[entropy_value] = 1
    print "Done in", time() - t0

    print "Outputting result to", output_path, "..."
    with open(output_path, 'w+') as out:
        out.write("ClickEntropyScore,NOfQueries\n")
        out.write('\n'.join((OUTPUT_FORMAT % (i/float(QUANTIZATION_FACTOR), (i+1)/float(QUANTIZATION_FACTOR), n) for i, n in sorted(n_of_queries.items()))))
    print "File written."

    print "Tota: done in", time() - t_init
    print "Terminating script."
def top_k_queries(k, host, queries_filter_file, allowed_users_file, ids_mapping_file):
    queries_ids = compute_everything(host, queries_filter_file, allowed_users_file, k)['q_list']
    queries_strings_indexed_by_id = [l.strip() for l in univ_open(ids_mapping_file)]
    queries_strings = [queries_strings_indexed_by_id[i] for i in queries_ids]
    return queries_strings
def main():
    ################################################################################
    import sys
    from time import time
    t0 = time()

    n_args = len(sys.argv)
    if n_args < (len(CLI_ARGS)+1):
        print "Usage:", sys.argv[0], " ".join(CLI_ARGS), " ".join(OPT_ARGS)
        exit()

    start_index, end_index = None, None

    if n_args > len(CLI_ARGS)+1:
        start_index = int(sys.argv[len(CLI_ARGS)+1].strip())

    if n_args > len(CLI_ARGS)+2:
        end_index = int(sys.argv[len(CLI_ARGS)+2])

    # Run that immediately so that we crash on the spot if we cannot connect to the DB anyway
    from pymongo import MongoClient
    mdb_host = sys.argv[3].strip()
    mdb_conn = MongoClient(host=mdb_host)
    mdb = mdb_conn.queries.clustering

    clusters_path = sys.argv[1].strip()
    queries_id_mapping_filepath = sys.argv[2].strip()
    ################################################################################

    ################################################################################
    cp = ClustersProcessor(clusters_path)
    t0 = time()
    cp.process()
    print "Loaded", len(cp.clusters), "clusters."
    ################################################################################

    ################################################################################
    if start_index is None and end_index is None:
        print "Dropping previous DB"
        try:
            mdb.drop()
            mdb = mdb_conn.queries.clustering
        except Exception as err:
            print type(err), err
    else:
        if start_index is None:
            # Remove everything up to the end_index
            where_clause = {'_id': {'$lte': end_index}}
        elif end_index is None:
            # Remove everything starting at start_index
            where_clause = {'_id': {'$gte': start_index}}
        else:
            # Remove everything between the bounds
            where_clause = {'$and': [{'_id': {'$gte': start_index}}, {'_id': {'$lte': end_index}}] }
        print "Removing documents with following where_clause=", where_clause
        mdb.remove(where_clause)
    ################################################################################
        

    ################################################################################
    t0 = time()
    # Note: 40k seems to be the limit, more than that and MongoDB will say "query is too large"
    batch_size = 40000
    start = 0
    end = 0
    with univ_open(queries_id_mapping_filepath, mode='r') as f:
        queries_vectors = []
        i = 0
        for line in f:
            # Skip everything up to the starting index
            if start_index is not None and i < start_index:
                i += 1
                continue
            # hop, we reached the end index, break there
            if end_index is not None and i > end_index:
                break
 
            queries_vectors.append(cp.cluster_vector_for_kw(line))
            
            i += 1
            if i % batch_size == 0:
                start = i - batch_size
                end = i
                sys.stdout.write("Committing batch %d %d...\t" % (start, end))
                sys.stdout.flush()
                insert_with_ids_range(mdb, queries_vectors, start, end)
                print "done."
                queries_vectors = [] # GC?
        # end for line in f

    # If the total number of vectors was not a multiple of batch_size, commit the last (partial) batch.
    # Note that the variable "end" still holds the last value that was assigned to it:
    # either 0, if the batch size is greater than the total size of the data,
    # or the last index that was committed.
    # queries_vectors only contains the vectors accumulated since the last committed batch,
    # so it is non-empty exactly when there is a leftover batch to commit.
    if queries_vectors:
        print "Last batch..."
        insert_with_ids_range(mdb, queries_vectors, end, None)
    print "Committed", i, "vectors in", time()-t0
    ################################################################################

    ################################################################################
    print "No index creations needed"
    ################################################################################

    ################################################################################
    print "Closing MDB connection"
    mdb_conn.close()
    ################################################################################
    print "Done in", time()-t0
    print "Terminating script."
def compute_everything(host, queries_filter_file, allowed_users_file, top_n=N_QUERIES, null_cluster_norm_threshold=ZERO_FLOAT):
    """
        This function will take as input a list of allowed queries and users and
        output the users and queries after filtering the ones (users and queries) that have null clustering
        vector + will output some other useful information..

        Inputs:
            see CLI arguments corresponding to most of the arguments of this method.
            top_n is the maximum number of queries that we keep, after having pruned the ones with null clustering vectors
            null_cluster_norm_threshold is the floating point threshold under which we consider a clustering vector
                to be null
        Outputs:
            users: the set of users that both queried at least once a query of queries_filter_file 
                and are in the allowed_users_file set of users
            sums_of_clusters: the sum of all clusters of queries the user queried
            q_list: the list of queries that we kept, from queries_filter_file and 
                by removing queries with null clustering vectors
            clusters: the clusters that we loaded, for the given queries
            removed_queries: queries with null clustering vector that we removed
    """


    from univ_open import univ_open
    from numpy.linalg import norm
    from pymongo import MongoClient
    print "Connecting to MongoDB..."
    mdb_conn = MongoClient(host=host)
    init_mdb(mdb_conn)
    us.init_mdb(mdb_conn)

    print "Parsing filter files..."
    print "Queries list..."
    q_list = [int(line.strip()) for line in univ_open(queries_filter_file, mode='r')]

    print "Loaded", len(q_list), "seed queries."

    print "Retrieving queries clusters..."
    # Retrieve all the clusters of the queries
    clusters = us.clusters(q_list)
    # Check for null vectors... just in case...
    print "Checking for null clusters vectors..."
    removed_queries = []
    for qid, cl in clusters.items():
        if norm(cl) < null_cluster_norm_threshold:  # Should be precise enough?
            print "Warning, query", qid, "has a null clustering vector"
            print "Removing it"
            del clusters[qid]
            removed_queries.append(qid)

    print "Previously had", len(q_list), "queries. Now have", len(clusters), "queries"
    print "Taking the top", top_n,"out of them"
    q_list2 = []
    for q in q_list:
        if q in clusters:
            q_list2.append(q)
            if len(q_list2) >= top_n:
                print "Reached the", top_n, "queries"
                break
    q_list = q_list2
    
    print "Allowed users list..."
    allowed_users = set([int(line.strip()) for line in univ_open(allowed_users_file, mode='r')])
    
    print "Parsed user file, processing it..."
    users_queries = {}
    for item in users_who_queried(q_list):
        uid = item['uid']
        if uid in allowed_users:
            try:
                users_queries[uid].append(item['qid'])
            except KeyError:
                users_queries[uid] = [item['qid']]

    # Now, we are going to generate the list of pairs to compute similarity against, by only keeping pairs of users
    # that have at least one cluster in common, that is to say, that issued at least one query that has a cluster in
    # common with at least one query of the other user...
    users = users_queries.keys()
    n_users = len(users)
    # We compute all of them at once instead of doing it in the loop because if we did it in the loop we would end up
    # computing them multiple times (i times for the ith item of the 'users' list)
    print "Computing sums of clusters vectors per user... (", n_users, "users)"

    sums_of_clusters = []
    for u in users:
        try:
            for qid in users_queries[u]:
                try:
                    sums_of_clusters.append(sum(clusters[qid]))
                except KeyError as err:
                    print err
                    print "The qid=", qid, "was not found?!?"
                    exit()
        except KeyError as err:
            print err
            print "The user="******"was not found?!?"
            exit()

    return {
        'users': users,
        'sums_of_clusters': sums_of_clusters,
        'q_list': q_list,
        'clusters': clusters,
        'removed_queries': removed_queries,
    }
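# A minimal usage sketch (the host and file paths below are hypothetical):
#
#     res = compute_everything('localhost', 'queries_filter.lst', 'allowed_users.lst', top_n=100)
#     print "Kept", len(res['q_list']), "queries for", len(res['users']), "users"
#     print len(res['removed_queries']), "queries had a null clustering vector and were dropped"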
    def process(self, **options):
        # Note: Although we are not going to use all the indices in this list,
        # we are still using a list in order to have fast direct element access.
        # In order not to waste memory, though, we initialize everything to None
        # and then only initialize the base dictionary for the elements of the list
        # that are going to be used
        self.user_queries_number = [None] * self.MAX_USER_ID
        for _ in options['allowed_user_ids']:
            self.user_queries_number[_] = {'_id': _}

        try:
            excluded_qids = options['excluded_qids'] if options['excluded_qids'] is not None else set()
        except KeyError:
            excluded_qids = set()

        last_queryid = -1
        with univ_open(self.log_filepath, mode='r') as f:
            i = 0
            for line in f:
                i += 1
                if i % 500000 == 0:
                    print "Currently at line", i
                line = line.strip().split('\t')
                if len(line) != 5\
                    and len(line) != 3:  # This is neither a click log line nor a search log line
                    # Note that if there are no clicks, a "search log line" will appear in the logs,
                    # but if there were some clicks, that line will not appear and one (or several)
                    # "clickthrough log lines" will appear instead. As a consequence, we cannot simply
                    # count the number of "search"-type lines to know how many times the user issued
                    # the query; we also have to consider the click lines.
                    continue
                try:
                    user_id = int(line[0])
                    
                    if user_id in options['allowed_user_ids']:
                        queryid = int(line[1].strip())

                        if queryid in excluded_qids:
                            continue

                        curr_time = mktime(datetime.strptime(line[2].strip(), '%Y-%m-%d %H:%M:%S').timetuple())

                        # When was it the last time that we saw a log line about this query?
                        # Note: clickthrough log lines for the same query are contiguous
                        last_time = self.user_queries_number[user_id] \
                                .setdefault(queryid, (0, float('-inf')))[1]
                        # Has there been enough time since that last time for us to consider that the user
                        # is really re-issuing the query and not just getting back to the SERP and clicking a new
                        # link?
                        # We will also consider that if the user issued another query in between, and then went back
                        # to this query, it means he did re-issue this query.
                        # Note that as the click log lines seem to be grouped by user, we just compare to the last
                        # query id without keeping a user-specific last query id.
                        if abs(curr_time - last_time) >= SAME_QUERY_TIME_INTERVAL or last_queryid != queryid:
                            self.user_queries_number[user_id][queryid] = (
                                1 + self.user_queries_number[user_id][queryid][0],
                                curr_time
                            )
                        else:
                            # We still update the last time this user issued this query in order to avoid
                            # someone coming back to the page every 5 seconds, 20 times, being detected as
                            # re-issuing the query just because the last time we registered was only the first
                            # time this user went on the page
                            self.user_queries_number[user_id][queryid] = (
                                    # Please note here there is no +1, of course
                                    self.user_queries_number[user_id][queryid][0],
                                    curr_time
                            )

                        # And in any case, we also update the last query that we saw... of course
                        last_queryid = queryid
                except ValueError as err:
                    print "Line number", i, "has invalid user id:", err
                except KeyError as err:
                    print "KeyError: ", err
                    print user_id, queryid
                    exit()
def main(stop_after_init=False):
    from sys import argv

    argc = len(argv)
    if argc <= len(CLI_ARGS):
        print 'Usage: %s %s %s' % (argv[0], ' '.join(CLI_ARGS), ' '.join(["[%s]" % x for x in OPTIONAL_ARGS]))
        print 'Currently missing arguments:', ' '.join(CLI_ARGS[len(argv)-1:])
        exit()

    serp_result_file            = argv[1].strip()
    url_web_crawl_ids_mapping   = argv[2].strip()
    output_path                 = argv[-1].strip()

    t_init = time()

    print "Loading SERP..."
    with univ_open(serp_result_file, 'r') as f:
        serps = jload(f)
    print "Loaded"

    print "Loading urls-to-ids dict..."
    urls_to_ids = {}
    with univ_open(url_web_crawl_ids_mapping, 'r') as f:
        i = 0
        for line in f:
            line = line.strip().lower().replace("%0a", '')
            urls_to_ids[line] = i
            i += 1
    print "Loaded"

    print "Converting SERP..."
    t0 = time()
    not_converted = set()
    total_urls = set()
    converted_set = set()
    for query_serps in serps.values():
        for serp in query_serps:
            i = 0
            while i < len(serp['results']):
                pos, url = serp['results'][i]
                url = url.lower().replace('%0a', '')
                total_urls.add(url)
                try:
                    serp['results'][i] = (pos, urls_to_ids[url])
                    converted_set.add(url)
                except KeyError as err:
                    # Looks like this URL has not been seen during the web crawl, as it has no assigned ID
                    not_converted.add(url)
                    serp['results'].pop(i)
                    i -= 1
                i += 1
    print "Over", len(total_urls), "total different URLs from the SERP results,", len(not_converted), "could not be converted"
    if len(total_urls) - len(not_converted) < 600:
        print converted_set
    print "Done in", time()-t0

    print "Writing URLs to output file", output_path, "..."
    t0 = time()
    with univ_open(output_path, 'w+') as out:
        jdump(serps, out)
    print "Done in", time()-t0

    print "Script executed in", time() - t_init, "seconds"
def main(argv, no_compute=False, store_results=True):
    import sys
    # from sys import argv
    from time import time

    argc = len(argv)
    if argc <= len(CLI_ARGS):
        print "Usage: %s"  % argv[0], ' '.join(CLI_ARGS)
        print "Currently missing parameters arguments:", ' '.join(CLI_ARGS[len(argv)-1:])
        exit()

    pairs_path  = argv[1].strip()
    mdb_host    = argv[2].strip()

    t_init = time()

    with univ_open(pairs_path, 'r') as f:
        pairs = [tuple([int(_) for _ in l.strip().split(" ")]) for l in f]

    targeted_users = [p[1] for p in pairs]

    sim_users = load_sim_users(mdb_host, targeted_users)
    all_allowed_users = load_set_of_similar_users(mdb_host, targeted_users) | set(targeted_users)

    # This will load everything needed to compute sim() function in-ram
    similar_users(argv[1:], True, preset_set_of_users=all_allowed_users)

    print "We loaded", len(us.users_clicks_list_indexed_by_uid), "users' clicks information"

    # Init the sim_users array of the user_similarity module
    us.init_sim_users(sim_users)

    if no_compute:
        return

    # We're ready to compute!

    for q, u in pairs:
        # Note: The collaborative score is not computed on the user's own clicked pages
        # but using the clicks of the users similar to him
        # so we have to compute scores for all pages clicked BY ITS SIMILAR USERS
        for sim, u_sim in sim_users[u]:
            try:
                pages = us.users_clicks_list_indexed_by_uid[u_sim][q].keys()
            except KeyError as err:
                key = int(err.message)
                if key == q and key != u_sim:
                    # Well, this similar user just has not queried this query...
                    continue
                print "For u_sim, q", u_sim, q
                print "KeyError with value=", err
                continue
            if pages is None:
                print "?!?! The user", u_sim, "user similar to", u, "has no pages information."
            else:
                print "User", u_sim, "user similar to", u, "has", len(pages), "pages informations for query", q
                for page in pages:
                    # try:
                    sys.stdout.write("score(u=%s, q=%s, p=%s)=" % (u, q, page))
                    sys.stdout.flush()
                    sys.stdout.write("%.5e\n" % us.score(q, page, u))
                    # except KeyError as err:
                    #     print "The KeyError is:", err
                    #     print "For some reason, the user", u, "probably is not in the sim_users cache"
                    #     print "Dump of the sim_users cache:"
                    #     print sim_users
                    #     raise err
                    # except Exception as err:
                    #     print "Errors happen, this time it is:", err

    if store_results:
        print "Storing the computed scores in the DB"
        print "We are going to use the cache of the us module"
        print "The cache currently contains", len(us.scores), "entries"
        t0 = time()
        mdb = MongoClient(host=mdb_host)
        scores_vectors = {}
        for (q, p, u), score in us.scores.items():
            if score == 0.0:
                # Nil score is the same as no score
                continue
            try:
                scores_vectors[(u, q)].append((p, score))
            except KeyError:
                scores_vectors[(u, q)] = [(p, score)]
        print "Precomputation took", time()-t0
        print "We have", len(scores_vectors), "'(u, q) -> score' entries to commit to the DB"
        
        print "Committing new one..."
        t0 = time()
        scores_to_commit = []
        for (user, q), scores in scores_vectors.items():
            print "Dropping previous information..."
            mdb.users.urls_perso_scores.remove({'uid': user, 'qid': q})
            scores_to_commit.append(
                {
                    'uid': user,
                    'qid': q,
                    'vector': scores
                }
            )
        mdb.users.urls_perso_scores.insert(scores_to_commit)
        print "Done, committing took", time()-t0
        print "Stats:"
        for (user, q), scores in scores_vectors.items():
            print "We have", len(scores), "scores for (q, u)=", (q, user)

    print "Script execution took", time()-t_init
def main(stop_after_init=False):
    from sys import argv

    argc = len(argv)
    if argc <= len(CLI_ARGS):
        print 'Usage: %s %s %s' % (argv[0], ' '.join(CLI_ARGS), ' '.join(["[%s]" % x for x in OPTIONAL_ARGS]))
        print 'Currently missing arguments:', ' '.join(CLI_ARGS[len(argv)-1:])
        exit()

    urls_from_logs_to_ids_file   = argv[1].strip()
    web_crawl_urls_to_ids_file   = argv[2].strip()
    output_path                  = argv[-1].strip()

    t_init = time()

    print "Loading domains string -> id mapping..."
    t0 = time()
    domains_string_to_ids = {}
    with univ_open(urls_from_logs_to_ids_file, 'r') as f:
        current_index = 0
        for line in f:
            domains_string_to_ids[line.strip().lower().replace("%0a", '')] = current_index
            
            current_index += 1
    print "Done in", time()-t0

    print "Counting urls..."
    t0 = time()
    number_of_urls = 0
    with univ_open(web_crawl_urls_to_ids_file, 'r') as f:
        # Note: I thought I'd use len(f.readlines()) but building this huge list in memory takes ages for nothing
        for l in f:
            number_of_urls += 1

    print "Mapping URLs to their domain id...."
    web_crawl_urls_to_domain_ids = [None] * number_of_urls
    with univ_open(web_crawl_urls_to_ids_file, 'r') as f:
        f.readline()  #1st line has no info
        current_index = 0
        start_index = 0  # The second line contains no comma
        for line in f:
            line = line.strip().lower()
            if line == "]":
                continue
            line = jloads(line[start_index:]).replace("%0a", '')
            start_index = 1
            domain = extract_domain(line)
            try:
                web_crawl_urls_to_domain_ids[current_index] = domains_string_to_ids[domain]
            except KeyError:
                pass  # well, not found, then keep no values to assign
            
            current_index += 1

    print "Done in", time()-t0

    print "Writing URLs to output file", output_path, "..."
    t0 = time()
    with univ_open(output_path, 'w+') as out:
        out.write(
            "\n".join( 
                "%d" % web_crawl_urls_to_domain_ids[i] 
                    if web_crawl_urls_to_domain_ids[i] is not None
                    else ""
                for i in xrange(len(web_crawl_urls_to_domain_ids))
            )
        )
    print "Done in", time()-t0

    print "Script executed in", time() - t_init, "seconds"
log_filepath =  sys.argv[1].strip()
url_domain_ids_path = sys.argv[2].strip()
output_path = sys.argv[3].strip()

# If True, the resulting output will be a valid JSON list
# Else, it will just be one line = one item list, ordered by index (no huge difference!)
JSON_OUTPUT = False

# Size of IO batch writes
BATCH_SIZE = 10000

from time import time
batch_str = ''
items_in_queue = 0
t0 = time()
with univ_open(output_path, 'wb') as outfile:
    print "Loading url_domain->ids mapping... (brace yourself, RAM!)"
    t0 = time()
    with univ_open(url_domain_ids_path, 'rb') as idmapfile:
        url_domain_ids = {}
        i = 0
        for line in idmapfile:
            url_domain_ids[line.strip()] = str(i)
            i += 1
    print "Done in", time()-t0

    print "Starting logs parsing + I/O to output file", output_path
    t0 = time()
    with univ_open(log_filepath, mode='r') as f:
        for line in f:
            line = line.strip().split('\t')
    def process(self, **options):
        # serps_to_logs_mapping will map each (queryid, urlid) pair coming from the SERPs to the
        # domain id observed in the click logs; logs_clicked_domain gathers, for each query and
        # result position, the list of clicked domain ids found in the logs
        self.serps_to_logs_mapping = {}
        self.logs_clicked_domain = {}

        try:
            excluded_qids = options['excluded_qids'] if options['excluded_qids'] is not None else set()
        except KeyError:
            excluded_qids = set()

        web_crawl_urls_to_domain_ids    = options['web_crawl_urls_to_domain_ids']
        query_str_to_ids                = options['query_str_to_ids']
        serps                           = options['serps']
        try:
            allowed_queries             = options['allowed_queries']
        except KeyError:
            allowed_queries             = None
            print "Warning, none set of allowed queries"


        with univ_open(self.log_filepath, mode='r') as f:
            i = 0
            for line in f:
                i += 1
                if i % 500000 == 0:
                    print "Currently at line", i
                line = line.strip().split('\t')
                if len(line) != 5:  # This is not a click log line (it could just be a search log line)
                    continue
                try:
                    user_id = int(line[0])
                    queryid = int(line[1].strip())
                    position = int(line[3].strip())
                    url_domain_id = int(line[4].strip())
                except ValueError as err:
                    print i, "has invalid user, query, position or url_domain id:", err
                    continue  # Skip this line as we cannot parse it anyway
                except KeyError as err:
                    print "KeyError: ", err
                    print queryid, url_domain_id, user_id
                    exit()
                    
                if user_id not in options['allowed_users']:
                    continue

                if (allowed_queries is not None and queryid not in allowed_queries) or queryid in excluded_qids:
                    continue
                
                self.logs_clicked_domain.setdefault(queryid, {}).setdefault(position, []).append(url_domain_id)

        print "Result of clicks (pos, domain) gathering:"
        print self.logs_clicked_domain

        clicks = list()
        urls_serps = list()
        mapped = list()
        for queryid, click_entries in self.logs_clicked_domain.items():
            query_str = query_str_to_ids[queryid]
            if query_str not in serps:
                print "[INFO] Query", query_str, "was not in the serps results"
                continue
            for position, domain_ids in sorted(click_entries.items()):
                for domain_id in domain_ids:
                    clicks.append(domain_id)
                    for serp in serps[query_str]:
                        i = 0 
                        while i < len(serp['results']):
                            pos, urlid = serp['results'][i]
                            urls_serps.append(urlid)
                            urlid_domain_id = web_crawl_urls_to_domain_ids[urlid]
                            if domain_id == urlid_domain_id:
                                mapped.append((queryid, urlid))
                                if (queryid, urlid) in self.serps_to_logs_mapping:
                                    print (queryid, urlid), "was already mapped to", self.serps_to_logs_mapping[(queryid, urlid)]
                                    print "Remapping it to", domain_id
                                self.serps_to_logs_mapping[(queryid, urlid)] = domain_id
                                serp['results'].pop(i)  # popping it so that we do not insert it twice
                                i -= 1
                            i += 1

        print "We went through", len(set(clicks)), "different domain ids (", \
            len(clicks), "total (domain, pos) entries)" \
            , "from the logs,", \
            len(set(urls_serps)), "different urlids (", len(urls_serps), "(pos,urlids) entries in total)", \
            "from the SERPs and we mapped", len(mapped), "URLs,", len(set(mapped)), "different ones"
def main():
    # TODO: 
    # load the requeried SERPs and store them as follows:
    # (query_id, [complete_list_of_urls_from_serps_in_order_of_appearance])
    # then when we go through the users' clicks, we do:
    # index = urls_from_serps[queryid].find(current_domain, lambda: extract_domain())
    # urls_from_serps.pop(index)
    # somehow get the info about the domain ID in the logs, make the mapping, etc. ...
    import sys
    from sys import argv
    t_init = time()

    argc = len(sys.argv)

    if argc < (len(CLI_ARGS)+1):
        print "Usage:", argv[0], " ".join(CLI_ARGS), " ".join(OPT_ARGS)
        exit()

    logfile                                         = argv[1]
    mdb_host                                        = argv[2]
    queries_users_file                              = argv[3]
    query_str_to_ids_mapping_file                   = argv[4]
    serp_requery_result_file                        = argv[5]
    web_crawl_graph_url_to_domain_ids_mapping_file  = argv[6]
    output_path                                     = argv[-1]

    print "Loading target users..."
    queries_users_pairs = [tuple(map(int, _.strip().split(' '))) for _ in univ_open(queries_users_file, 'r')]
    targeted_users_set = set([_[1] for _ in queries_users_pairs])
    targeted_queries_set = set([_[0] for _ in queries_users_pairs])

    print "Loaded", len(targeted_users_set), "target users."

    print "Loading the set of their top similar users..."
    set_of_similar_users = load_set_of_similar_users(mdb_host, targeted_users_set)
    print "Loaded a total of", len(set_of_similar_users), "new allowed users"
    allowed_users = set_of_similar_users | targeted_users_set
    print len(allowed_users), "users allowed in total"

    print "Loading SERP file..."
    t0 = time()
    serps = jload(univ_open(serp_requery_result_file, 'r'))
    print "Done in", time() - t0

    print "Loading IDs <-> query strings..."
    t0 = time()
    queries_str_indexed_by_ids = [_.strip() for _ in univ_open(query_str_to_ids_mapping_file, 'r')]
    print "Done in", time() - t0

    print "Loading (web crawl) URL IDs <-> (logs) domain ids..."
    t0 = time()
    web_crawl_urls_to_domain_ids = [int(_.strip()) if _ != '\n' else None for _ in univ_open(web_crawl_graph_url_to_domain_ids_mapping_file, 'r')]
    print "Done in", time() - t0

    lp = LogProcessor(logfile)
    t0 = time()
    print "Starting process..."
    lp.process(
        serps=serps,
        allowed_queries=targeted_queries_set,
        allowed_users=allowed_users,
        query_str_to_ids=queries_str_indexed_by_ids,
        web_crawl_urls_to_domain_ids=web_crawl_urls_to_domain_ids
    )
    print "Done in", time() - t0

    print "Outputting result to", output_path, "..."
    with open(output_path, 'w+') as out:
        # q, urlid, domainid
        # meaning (q, urlid) -> domainid
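        # e.g. a line "12,345,678" means that SERP URL id 345, returned for query id 12,
        # was mapped to logs domain id 678 (the numbers here are purely illustrative)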
        out.write('\n'.join(("%d,%d,%d" % (item[0][0], item[0][1], item[1]) for item in lp.serps_to_logs_mapping.items())))

    print "Done in", time() - t_init
    print "Terminating script."
clusters = load_clusters_for_queries(
    qs,
    clusters_path,
    queries_id_mapping_filepath
)

print "Computing the queries that we can remove..."

removed_queries = compute_removed_queries_because_of_null_clustering('/tmp/blorg', clusters)

print "We can remove", len(removed_queries), "from this set"

final_set = qs - removed_queries 
print "That makes the total set of size:", len(final_set)

# Now get the real query strings:
print "Converting from queryids back to query strings..."
query_strings_set = set()
with univ_open(queries_id_mapping_filepath, mode='r') as f:
    queryid = 0
    for l in f:
        if queryid in final_set:
            query_strings_set.add(l.strip().lower())
        queryid += 1  # lines numbers are the ids of the queries

output_path = pjoin(HOME, 'cppr/data/serp_crawl_queries_to_be_reissued_similar_user_profiles_queries.lst')
print "Outputting result to", output_path, "..."
with univ_open(output_path, mode='w+') as out:
    out.write('\n'.join(query_strings_set))

print "Done"
def load(path):
    with univ_open(path, 'r') as f:
        return dict(
            (map(int, l.strip().split(' ')) for l in f)
        )
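# A minimal usage sketch for load (the file name is hypothetical). Each input line is expected to
# hold two space-separated integers, e.g. a line "12 345" becomes the dict entry {12: 345}.
#
#     id_mapping = load('web_crawl_ids_to_log_ids.map')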
def pickle_open_and_write(pickle_path, data, dump_f=pdump):
    t0 = time()
    with univ_open(pickle_path, 'wb+') as f:
        dump_f(data, f)
    print "Done ", time()-t0