Ejemplo n.º 1
0
def process_file(known_queries, file):
    """Print clickstream fingerprints for the leading records of an Avro file.

    Every record yielded by ``yu.avro_reader(file)`` is handed to
    ``yu.get_clickstream_fingerprint`` together with ``known_queries``; each
    fingerprint it yields is written to stdout as one tab-separated line of
    ten fields.  A progress line is written to stderr every 10000 records.

    Args:
        known_queries: Passed through unchanged to
            ``yu.get_clickstream_fingerprint``.
        file: Passed through unchanged to ``yu.avro_reader``.

    Returns:
        The number of records read from the reader.  At most 100000 records
        are processed; when the input holds more, the record that triggers
        the stop is read and counted but not processed, so 100001 is
        returned in that case.
    """
    max_records = 100000
    # Field layout: fp[0][0], then element 0 of fp[1], fp[2], fp[3], then
    # element 1 of each, then element 2 of each.
    line_format = "%d\t%s\t%s\t%s\t%d\t%d\t%d\t%d\t%d\t%d\n"

    count = 0
    # Number of fingerprint lines written so far.  Starts at 0 so that the
    # progress report is not one higher than what was actually emitted.
    out_count = 0
    for e in yu.avro_reader(file):
        count += 1
        if count > max_records:
            break
        if count % 10000 == 0:
            # Explicit write() calls instead of the print statement keep the
            # output identical while remaining valid on Python 2 and 3.
            sys.stderr.write("Count %d out %d\n" % (count, out_count))
            sys.stderr.flush()
        for fp in yu.get_clickstream_fingerprint(known_queries, e):
            sys.stdout.write(line_format % (
                fp[0][0],
                fp[1][0], fp[2][0], fp[3][0],
                fp[1][1], fp[2][1], fp[3][1],
                fp[1][2], fp[2][2], fp[3][2],
            ))
            out_count += 1
    return count
Ejemplo n.º 2
0
def process_file(query_hashes, file):
    """Count (query hash, region, url, position bucket) occurrences in a file.

    Iterates the sessions yielded by ``yu.avro_reader(file)`` and, for every
    query in ``session['queries']`` whose ``'queryHash'`` is contained in
    ``query_hashes``, increments one counter entry per URL in
    ``query['urls']``.  The counter key is the tab-separated string
    ``queryHash, regionId, url, pos_marker`` where ``pos_marker`` is 0 for
    the first five URL positions and 1 for the rest.  The resulting counter
    is handed to ``dump_cache``.

    Progress is reported on stderr every 10000 queries and once more when
    the scan ends.

    Args:
        query_hashes: Container tested with ``in`` against each query hash.
        file: Passed through unchanged to ``yu.avro_reader``.

    Returns:
        None.
    """
    max_sessions = 100000
    progress_format = "Queries: %d Sessions: %d Out %d\n"

    counter = Counter()
    queries_seen = 0
    sessions_seen = 0
    queries_matched = 0
    for e in yu.avro_reader(file):
        # Test the cap before counting the session: the previous
        # increment-then-break order stopped after 99999 sessions while
        # still reporting 100000.
        if sessions_seen == max_sessions:
            break
        sessions_seen += 1
        for q in e['queries']:
            queries_seen += 1
            if queries_seen % 10000 == 0:
                # write() instead of the print statement: same output, and
                # valid syntax on both Python 2 and 3.
                sys.stderr.write(progress_format % (
                    queries_seen, sessions_seen, queries_matched))
            query_hash = q['queryHash']
            if query_hash not in query_hashes:
                continue
            queries_matched += 1
            for pos, url in enumerate(q['urls']):
                # Bucket the position: 0 for the first five URLs, 1 for the
                # remainder.
                pos_marker = 0 if pos < 5 else 1
                key = "%d\t%d\t%d\t%d" % (
                    query_hash, e['regionId'], url, pos_marker)
                counter[key] += 1
    sys.stderr.write(progress_format % (
        queries_seen, sessions_seen, queries_matched))
    dump_cache(counter)
Ejemplo n.º 3
0
def main(args):
    """Load the judged queries and process each remaining argument.

    Args:
        args: Sequence whose first item is given to
            ``yu.read_judged_queries``; every following item is opened with
            ``yu.avro_reader`` and the resulting reader is passed to
            ``process`` along with the loaded queries.
    """
    judged_queries_arg, avro_args = args[0], args[1:]
    known_queries = yu.read_judged_queries(judged_queries_arg)
    for avro_arg in avro_args:
        reader = yu.avro_reader(avro_arg)
        process(reader, known_queries)